import { cheerio } from "https://deno.land/x/cheerio@1.0.7/mod.ts";
import html2pug from 'npm:html2pug';
import { convertHtmlToMarkdown as semanticMarkdown } from 'npm:@yawnxyz/dom-to-semantic-markdown@1.2.9';
import { JSDOM } from 'npm:jsdom';
/**
 * Downloads a page and returns its HTML after removing unwanted elements.
 *
 * @param url - Address of the page to download.
 * @param removeSelectors - Selector list handed to cheerio; every matching
 *   element is removed before the document is serialized.
 * @returns The cleaned HTML, or `null` when the request fails, the server
 *   responds with a non-2xx status, or processing throws.
 */
export async function fetchHtml(
  url: string,
  removeSelectors: string = "style, script, link, noscript, frame, iframe, comment",
): Promise<string | null> {
  // NOTE(review): the "comment" entry in the default list presumably matches
  // <comment> elements rather than HTML comment nodes — confirm whether
  // comment stripping is actually intended here.
  try {
    const response = await fetch(url);
    if (!response.ok) {
      // Without this check an error page (404, 500, ...) would be cleaned and
      // returned as if it were the requested content.
      await response.body?.cancel(); // release the unread body stream
      console.error("Error fetching the URL:", `HTTP ${response.status} ${response.statusText}`);
      return null;
    }
    const html = await response.text();
    const $ = cheerio.load(html);
    $(removeSelectors).remove();
    return $.html();
  } catch (error) {
    console.error("Error fetching the URL:", error);
    return null;
  }
}
/**
 * Extracts the text of every element matching `selector` from an HTML string.
 *
 * Elements matching `removeSelectors` are removed first. The trimmed text of
 * each match is joined with single spaces, and any run of whitespace in the
 * result is collapsed to one space.
 *
 * @param html - Markup to search.
 * @param selector - Selector for the elements whose text is collected.
 * @param removeSelectors - Selector list of elements to drop beforehand.
 * @returns The combined, whitespace-normalized text.
 */
export function selectHtml(html, selector = "h1", removeSelectors = "style, script, link, noscript, frame, iframe, comment") {
  const doc = cheerio.load(html);
  doc(removeSelectors).remove();

  const pieces = doc(selector)
    .toArray()
    .map((node) => doc(node).text().trim());
  const combined = pieces.join(" ");

  return combined.replace(/\s+/g, ' ').trim();
}
/**
 * Converts an HTML string to Pug by delegating to `html2pug`.
 *
 * @param html - Markup to convert.
 * @param options - Options forwarded unchanged to `html2pug`; defaults to
 *   `{ tabs: true }`.
 * @returns Whatever `html2pug` produces for the given input.
 */
export function convertHtmlToPug(html, options = { tabs: true }) {
  return html2pug(html, options);
}
/**
 * Options accepted by `convertHtmlToMarkdown` and forwarded to the semantic
 * markdown converter.
 *
 * NOTE(review): `SemanticMarkdownAST` and `CustomNode` are not declared or
 * imported in the visible portion of this file — confirm where they come from.
 * Field descriptions marked "presumably" are inferred from names only and
 * should be verified against the converter's documentation.
 */
export interface ConversionOptions {
// Presumably the site's domain, used when handling URLs — TODO confirm.
websiteDomain?: string;
// Presumably restricts conversion to the page's main content — TODO confirm.
extractMainContent?: boolean;
// Presumably replaces URLs with short references — TODO confirm.
refifyUrls?: boolean;
// Presumably enables diagnostic output — TODO confirm.
debug?: boolean;
// DOMParser instance the converter should use; `convertHtmlToMarkdown` in this file supplies one from jsdom.
overrideDOMParser?: DOMParser;
// Presumably annotates table cells with column information — TODO confirm.
enableTableColumnTracking?: boolean;
// Callback receiving an element, these options and the indent level; may return AST nodes or undefined.
overrideElementProcessing?: (element: Element, options: ConversionOptions, indentLevel: number) => SemanticMarkdownAST[] | undefined;
// Same signature as above; presumably invoked for elements the converter does not handle itself — TODO confirm.
processUnhandledElement?: (element: Element, options: ConversionOptions, indentLevel: number) => SemanticMarkdownAST[] | undefined;
// Callback receiving an AST node, these options and the indent level; may return a string or undefined.
overrideNodeRenderer?: (node: SemanticMarkdownAST, options: ConversionOptions, indentLevel: number) => string | undefined;
// Callback receiving a CustomNode, these options and the indent level; may return a string or undefined.
renderCustomNode?: (node: CustomNode, options: ConversionOptions, indentLevel: number) => string | undefined;
// Presumably selects how much page metadata is included in the output — TODO confirm.
includeMetaData?: 'basic' | 'extended';
}
/**
 * Converts an HTML string to semantic Markdown.
 *
 * The converter is given a DOMParser because the runtime may not provide one.
 * A parser supplied by the caller in `options.overrideDOMParser` is used as
 * is; otherwise one is created from an empty jsdom window.
 *
 * @param htmlStr - Markup to convert.
 * @param options - Conversion options forwarded to the converter.
 * @returns The Markdown produced by the converter.
 */
export function convertHtmlToMarkdown(htmlStr: string, options?: ConversionOptions): string {
  let overrideDOMParser = options?.overrideDOMParser;
  if (!overrideDOMParser) {
    // Only the DOMParser constructor is needed from jsdom, so an empty window
    // is enough; parsing `htmlStr` here would duplicate the converter's work.
    const { window } = new JSDOM("");
    overrideDOMParser = new window.DOMParser();
  }
  return semanticMarkdown(htmlStr, { ...options, overrideDOMParser });
}
// Demo run executed when the module loads: fetch a sample article, convert it
// to semantic Markdown and print the result. Nothing is printed when the
// fetch yields no HTML.
(async function runDemo() {
  const pageUrl = "https://softwareyoga.com/latency-numbers-everyone-should-know";
  const pageHtml = await fetchHtml(pageUrl);
  if (!pageHtml) {
    return;
  }
  const rendered = convertHtmlToMarkdown(pageHtml, { enableTableColumnTracking: true });
  console.log("Semantic Markdown:", rendered);
})();