Public
Script
Val Town is a social website to write and deploy JavaScript.
Build APIs and schedule functions from your browser.
Readme

This module provides functions that convert HTML into other formats, such as semantic Markdown, Turndown-generated Markdown, and Pug. It also includes utilities to fetch HTML from a URL, extract a page's metadata, and clean HTML by removing elements that match unwanted selectors. Dependencies include Cheerio for HTML parsing, JSDOM for DOM manipulation, and Turndown for converting HTML to Markdown.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// import { fetch } from "https://esm.town/v/std/fetch";
import { convertHtmlToMarkdown } from 'npm:@yawnxyz/dom-to-semantic-markdown';
import { JSDOM } from 'npm:jsdom';
import { cheerio } from "https://deno.land/x/cheerio@1.0.7/mod.ts";
import * as turndownPluginGfm from "npm:turndown";
// import { plugin } from "npm:turndown-plugin-gfm";
import html2pug from 'npm:html2pug';
/**
 * Downloads the document at `url` and returns its markup after a round trip
 * through Cheerio (load, then serialize).
 *
 * Failures are not propagated: anything thrown while fetching, reading the
 * body, or parsing is logged with `console.error` and reported as `null`.
 *
 * @param url Address handed to `fetch` as-is.
 * @returns The HTML string produced by Cheerio, or `null` when an error occurred.
 */
export async function getHtmlFromUrl(url) {
  try {
    const markup = await (await fetch(url)).text();
    // Parse and immediately re-serialize so callers get Cheerio's view of the page.
    return cheerio.load(markup).html();
  } catch (err) {
    console.error("Error fetching the URL:", err);
    return null;
  }
}
// TODO: accept an optional, already-fetched `html` argument and skip the fetch when it is provided?
/**
 * Fetches a page using browser-like request headers and collects its metadata
 * into a flat string map.
 *
 * Keys are filled in this order, so later writes win on a name clash:
 *  1. `title`: text of the first `<title>` element, or `""` if there is none.
 *  2. One entry per `<meta>` tag that has a non-empty `content` and a `name`
 *     attribute (falling back to `property`), stored as `name -> content`.
 *  3. `slug`: the last non-empty segment of the URL's path, or `""` for a
 *     root URL. Query strings, fragments, and trailing slashes are ignored.
 *
 * @param url Absolute URL of the page to inspect.
 * @returns The collected metadata.
 * @throws Nothing is caught here: errors from `fetch`, from reading the body,
 *   or from `new URL(url)` propagate to the caller.
 */
export async function getMetadataFromUrl(url: string): Promise<Record<string, string>> {
  const response = await fetch(url, {
    method: "GET",
    headers: new Headers({
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
      "Sec-Fetch-Site": "cross-site",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-User": "?1",
      "Sec-Fetch-Dest": "document",
      "Referer": "https://www.google.com/",
      // NOTE(review): these client hints advertise Chrome 91 while the
      // User-Agent above says Chrome 121 — confirm the mismatch is intentional.
      "sec-ch-ua": `"Not A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": `"macOS"`,
      "Upgrade-Insecure-Requests": "1",
    }),
  });
  const html = await response.text();

  // JSDOM has already parsed the markup; use its document directly instead of
  // parsing the same string a second time with DOMParser.
  const doc = new JSDOM(html).window.document;

  const metadata: Record<string, string> = {};
  metadata.title = doc.getElementsByTagName("title")[0]?.textContent || "";

  for (const metaTag of doc.getElementsByTagName("meta")) {
    const name = metaTag.getAttribute("name") || metaTag.getAttribute("property");
    const content = metaTag.getAttribute("content");
    if (name && content) {
      metadata[name] = content;
    }
  }

  // Work on the parsed pathname rather than the raw string so that a query
  // string or fragment after a trailing "/" cannot produce an empty slug.
  const pathSegments = new URL(url).pathname.split("/").filter((segment) => segment !== "");
  metadata.slug = pathSegments.pop() ?? "";

  return metadata;
}
/**
 * Drops every element matching `removeSelectors` from `html` and returns the
 * document as serialized by Cheerio.
 *
 * NOTE(review): `comment` in the default list is written as a tag-name
 * selector, so it presumably does not target HTML comment nodes — confirm intent.
 *
 * @param html Markup to clean.
 * @param removeSelectors Selector list passed straight to Cheerio; every match is removed.
 * @returns The cleaned HTML string.
 */
export function removeSelectorsFromHtml(html, removeSelectors = "style, script, link, noscript, frame, iframe, comment") {
  const $doc = cheerio.load(html);
  const unwanted = $doc(removeSelectors);
  unwanted.remove();
  return $doc.html();
}
/**
 * Extracts the text of every element matching `selector` and returns it as a
 * single whitespace-normalized line.
 *
 * Elements matching `removeSelectors` are removed first, so their text never
 * appears in the result. Each match's text is trimmed, the pieces are joined
 * with one space, and any remaining run of whitespace is collapsed to a
 * single space.
 *
 * @param html Markup to search.
 * @param selector Selector for the elements whose text is wanted.
 * @param removeSelectors Selector list of elements to discard before selecting.
 * @returns The combined text, or `""` when nothing matches.
 */
export function selectHtml(html, selector = "h1", removeSelectors = "style, script, link, noscript, frame, iframe, comment") {
  const $doc = cheerio.load(html);
  $doc(removeSelectors).remove();

  // One trimmed string per matched element.
  const pieces = $doc(selector)
    .map((_index, node) => {
      const text = $doc(node).text();
      return text.trim();
    })
    .get();

  const joined = pieces.join(" ");
  // Newlines and repeated spaces inside an element's text become a single space.
  const collapsed = joined.replace(/\s+/g, ' ');
  return collapsed.trim();
}
export async function convertHtml(html, option = "semantic") {
switch (option) {
case "markdown":
return convertHtmlToSemanticMarkdown(html);
September 10, 2024