Public
Script
Readme

This module provides functions to convert HTML to other formats such as Markdown (including via Turndown) and Pug. It also includes utilities to fetch HTML from a URL, extract metadata, and clean HTML by removing unwanted selectors. Dependencies include Cheerio for HTML parsing, JSDOM for DOM manipulation, and Turndown for converting HTML to Markdown.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// import { fetch } from "https://esm.town/v/std/fetch";
import { convertHtmlToMarkdown } from 'npm:@yawnxyz/dom-to-semantic-markdown';
import { JSDOM } from 'npm:jsdom';
import { cheerio } from "https://deno.land/x/cheerio@1.0.7/mod.ts";
import * as turndownPluginGfm from "npm:turndown";
// import { plugin } from "npm:turndown-plugin-gfm";
import html2pug from 'npm:html2pug';
// Downloads `url`, passes the response body through Cheerio, and returns the
// HTML that Cheerio serializes back out. Any failure along the way (fetch,
// reading the body, parsing) is logged and reported to the caller as `null`.
export async function getHtmlFromUrl(url) {
  let markup;
  try {
    const res = await fetch(url);
    const body = await res.text();
    // Round-trip through Cheerio so the caller gets Cheerio's serialization.
    markup = cheerio.load(body).html();
  } catch (err) {
    console.error("Error fetching the URL:", err);
    return null;
  }
  return markup;
}
// todo, pass in html optional, and skip fetch?
//
// Fetches `url` with browser-like request headers and collects page metadata
// into a flat string-to-string map.
//
// The returned object contains:
//   - `title`: text of the first <title> element, or "" when there is none.
//   - one entry per <meta> tag that has a non-empty `content` and a non-empty
//     `name` (preferred) or `property` attribute, keyed by that attribute.
//     Later tags overwrite earlier ones, and a meta tag keyed `title`
//     overwrites the <title> text.
//   - `slug`: the last non-empty path segment of `url`, or "" when the path
//     has none. Written last, so it wins over a meta tag keyed `slug`.
//
// The HTTP status is not inspected: whatever body the server returns is parsed.
//
// @param url  Absolute URL of the page to inspect.
// @returns    The metadata map described above.
// @throws     Whatever `fetch` / `response.text()` reject with, and the
//             TypeError raised by `new URL` if `url` cannot be parsed.
export async function getMetadataFromUrl(url: string): Promise<Record<string, string>> {
  const response = await fetch(url, {
    method: "GET",
    headers: new Headers({
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
      "Sec-Fetch-Site": "cross-site",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-User": "?1",
      "Sec-Fetch-Dest": "document",
      "Referer": "https://www.google.com/",
      "sec-ch-ua": `"Not A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": `"macOS"`,
      "Upgrade-Insecure-Requests": "1",
    }),
  });
  const html = await response.text();

  // JSDOM already parses the markup into a document; no second DOMParser pass
  // over the same string is needed.
  const doc = new JSDOM(html).window.document;

  const metadata: Record<string, string> = {};
  metadata.title = doc.getElementsByTagName("title")[0]?.textContent || "";

  for (const metaTag of doc.getElementsByTagName("meta")) {
    // Standard tags use `name`; Open Graph style tags use `property`.
    const name = metaTag.getAttribute("name") || metaTag.getAttribute("property");
    const content = metaTag.getAttribute("content");
    if (name && content) {
      metadata[name] = content;
    }
  }

  metadata.slug = slugFromUrl(url);
  return metadata;
}

// Returns the last non-empty segment of the URL's path, or "" if there is none.
// Working on the parsed pathname (rather than trimming the raw URL string)
// keeps trailing slashes, query strings and fragments from producing an empty
// slug, e.g. "https://example.com/posts/hello/?ref=x" -> "hello".
function slugFromUrl(url: string): string {
  const segments = new URL(url).pathname
    .split("/")
    .filter((segment) => segment !== "");
  return segments.pop() ?? "";
}
// Strips every element matching `removeSelectors` out of `html` and returns
// the document as re-serialized by Cheerio.
// NOTE(review): `removeSelectors` is used as a CSS selector list, so the
// `comment` entry in the default looks like it targets <comment> elements
// rather than HTML comment nodes - confirm that is the intent.
export function removeSelectorsFromHtml(html, removeSelectors = "style, script, link, noscript, frame, iframe, comment") {
  const doc = cheerio.load(html);
  const unwanted = doc(removeSelectors);
  unwanted.remove();
  return doc.html();
}
// Extracts the text of every element matching `selector` and returns it as a
// single whitespace-normalized string. Elements matching `removeSelectors`
// are removed from the parsed document before the selection is made.
export function selectHtml(html, selector = "h1", removeSelectors = "style, script, link, noscript, frame, iframe, comment") {
  const doc = cheerio.load(html);
  doc(removeSelectors).remove();

  // Trimmed text of each matched element, as a plain array.
  const pieces = doc(selector)
    .map((_index, node) => doc(node).text().trim())
    .get();
  const combined = pieces.join(" ");

  // Collapse each run of whitespace (newlines included) to one space, then
  // trim whatever is left at the ends.
  const collapsed = combined.replace(/\s+/g, ' ');
  return collapsed.trim();
}
export async function convertHtml(html, option = "semantic") {
switch (option) {
case "markdown":
return convertHtmlToSemanticMarkdown(html);
Val Town is a social website to write and deploy JavaScript.
Build APIs and schedule functions from your browser.
Comments
Nobody has commented on this val yet: be the first!
September 10, 2024