1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import { cheerio } from "https://deno.land/x/cheerio@1.0.7/mod.ts";
import html2pug from 'npm:html2pug';
import { convertHtmlToMarkdown as semanticMarkdown } from 'npm:@yawnxyz/dom-to-semantic-markdown@1.2.9';
// import { convertHtmlToMarkdown as semanticMarkdown } from 'npm:dom-to-semantic-markdown'; // uses Node so won't run here
import { JSDOM } from 'npm:jsdom';
/**
 * Fetches the document at `url` and returns its HTML with unwanted elements removed.
 *
 * The response body is loaded into Cheerio, every element matching
 * `removeSelectors` is deleted, and the remaining document is serialized back
 * to a string.
 *
 * Note: the default `removeSelectors` entries are tag-name selectors, so
 * `comment` matches `<comment>` elements, not HTML comment nodes.
 *
 * @param url - Address passed to `fetch`.
 * @param removeSelectors - Selector for the elements to delete before serializing.
 * @returns The cleaned HTML, or `null` when the request fails, the server
 *   answers with a non-2xx status, or processing throws (the error is logged).
 */
export async function fetchHtml(
  url: string | URL,
  removeSelectors = "style, script, link, noscript, frame, iframe, comment",
): Promise<string | null> {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      // An error page is not the requested document: release the unread body
      // and report the failure through the same path as a network error.
      await response.body?.cancel();
      throw new Error(`Request failed with status ${response.status} ${response.statusText}`);
    }
    const html = await response.text();
    const $ = cheerio.load(html);
    $(removeSelectors).remove();
    return $.html();
  } catch (error) {
    console.error("Error fetching the URL:", error);
    return null;
  }
}
/**
 * Extracts the text content of every element matching `selector` and returns
 * it as one whitespace-normalized line.
 *
 * Elements matching `removeSelectors` are deleted from the parsed document
 * first, so their text never appears in the result. Note that the default
 * `removeSelectors` entries are tag-name selectors (`comment` matches
 * `<comment>` elements, not HTML comment nodes).
 *
 * @param html - HTML markup to parse with Cheerio.
 * @param selector - Selector for the elements whose text is collected.
 * @param removeSelectors - Selector for the elements to delete before selecting.
 * @returns The matched elements' texts joined by single spaces, with every run
 *   of whitespace collapsed to one space and the ends trimmed; an empty string
 *   when nothing matches.
 */
export function selectHtml(
  html: string,
  selector = "h1",
  removeSelectors = "style, script, link, noscript, frame, iframe, comment",
): string {
  const $ = cheerio.load(html);
  $(removeSelectors).remove();

  const joinedText = $(selector)
    .map((_index, element) => $(element).text().trim())
    .get()
    .join(" ");

  // Text inside a single element can still contain newlines or runs of
  // spaces, so normalize once more after joining.
  return joinedText.replace(/\s+/g, " ").trim();
}
/**
 * Converts an HTML string into a Pug template by delegating to `html2pug`.
 *
 * @param html - HTML markup to convert.
 * @param options - Passed to `html2pug` unchanged; defaults to `{ tabs: true }`.
 * @returns The Pug source produced by `html2pug`.
 */
export function convertHtmlToPug(html: string, options = { tabs: true }): string {
  return html2pug(html, options);
}
// Options accepted by convertHtmlToMarkdown in this file.
// The shape appears to mirror the ConversionOptions type of dom-to-semantic-markdown:
// https://github.com/romansky/dom-to-semantic-markdown
// NOTE(review): the meaning of each field is defined by that library, not by this file — confirm against its documentation.
// NOTE(review): SemanticMarkdownAST and CustomNode are referenced below but are not declared or imported in the visible part of this file — TODO confirm where they come from.
export interface ConversionOptions {
websiteDomain?: string;
extractMainContent?: boolean;
refifyUrls?: boolean;
debug?: boolean;
// DOMParser instance handed to the converter in place of a global one.
overrideDOMParser?: DOMParser;
enableTableColumnTracking?: boolean;
// Callback hooks; each may return undefined (presumably to fall back to the library's default handling — verify).
overrideElementProcessing?: (element: Element, options: ConversionOptions, indentLevel: number) => SemanticMarkdownAST[] | undefined;
processUnhandledElement?: (element: Element, options: ConversionOptions, indentLevel: number) => SemanticMarkdownAST[] | undefined;
overrideNodeRenderer?: (node: SemanticMarkdownAST, options: ConversionOptions, indentLevel: number) => string | undefined;
renderCustomNode?: (node: CustomNode, options: ConversionOptions, indentLevel: number) => string | undefined;
includeMetaData?: 'basic' | 'extended';
}
/**
 * Converts an HTML string to semantic Markdown using the imported
 * dom-to-semantic-markdown converter.
 *
 * The converter is always given a DOMParser through `overrideDOMParser`
 * (presumably because this runtime has no global one). A parser supplied by
 * the caller is used as-is; otherwise one is created from an empty jsdom
 * window.
 *
 * @param htmlStr - HTML markup to convert.
 * @param options - Converter options, forwarded unchanged apart from
 *   `overrideDOMParser` being filled in when absent.
 * @returns The Markdown produced by the converter.
 */
export function convertHtmlToMarkdown(htmlStr: string, options?: ConversionOptions): string {
  let domParser = options?.overrideDOMParser;
  if (domParser == null) {
    // Only the DOMParser constructor is needed from jsdom, so an empty
    // document is enough; the converter parses `htmlStr` itself.
    const { window } = new JSDOM("");
    domParser = new window.DOMParser();
  }
  return semanticMarkdown(htmlStr, { ...options, overrideDOMParser: domParser });
}
// Example usage: download a page, strip unwanted tags, and print it as semantic Markdown.
// NOTE: this runs whenever the module is evaluated, including when it is imported.
void (async function runExample() {
  // Other pages to try:
  //   "https://example.com"
  //   "https://cnn.com"
  const url = "https://softwareyoga.com/latency-numbers-everyone-should-know";

  // Fetch the page and remove the default set of unwanted tags.
  const cleanedHTML = await fetchHtml(url);
  if (!cleanedHTML) {
    // Nothing usable came back, so there is nothing to convert.
    return;
  }

  // Convert the cleaned HTML to semantic Markdown and print it.
  const markdown = convertHtmlToMarkdown(cleanedHTML, { enableTableColumnTracking: true });
  console.log("Semantic Markdown:", markdown);

  // Alternative outputs from the same cleaned HTML:
  // const text = selectHtml(cleanedHTML, "body"); // Plain text of the selected elements
  // console.log("Selected Text:", text);
  // const pug = convertHtmlToPug(cleanedHTML); // Pug template
  // console.log("Converted Pug:\n", pug);
})();