import { convertHtmlToMarkdown } from 'npm:@yawnxyz/dom-to-semantic-markdown';
import { JSDOM } from 'npm:jsdom';
import { cheerio } from "https://deno.land/x/cheerio@1.0.7/mod.ts";
import * as turndownPluginGfm from "npm:turndown";
import html2pug from 'npm:html2pug';
/**
 * Downloads a page and returns its markup after a round trip through cheerio
 * (`cheerio.load` followed by `$.html()`).
 *
 * Any failure while fetching, reading or re-serializing is logged with
 * `console.error` and reported to the caller as `null` instead of an exception.
 * The HTTP status is not inspected, so the body of an error response (e.g. a
 * 404 page) is returned like any other document.
 *
 * @param url Address of the page to download.
 * @returns The serialized HTML, or `null` when something went wrong.
 */
export async function getHtmlFromUrl(url) {
  let serialized: string | null = null;

  try {
    const res = await fetch(url);
    const body = await res.text();
    serialized = cheerio.load(body).html();
  } catch (err) {
    console.error("Error fetching the URL:", err);
  }

  return serialized;
}
/**
 * Fetches a page with browser-like request headers and collects its metadata
 * into a flat string map.
 *
 * The result contains:
 * - `title`: text of the first `<title>` element, or `""` when there is none;
 * - one entry per `<meta>` tag that has a non-empty `content` and a `name`
 *   (or, failing that, `property`) attribute, keyed by that attribute. Later
 *   tags overwrite earlier ones, and a tag named `title` overwrites the
 *   document title;
 * - `slug`: the last non-empty segment of the URL path, or `""` for a root
 *   URL. It is written last, so it wins over any `<meta name="slug">`.
 *
 * @param url Absolute URL of the page to inspect.
 * @returns The metadata found on the page.
 * @throws Nothing is caught here: a rejected `fetch` or a `url` that
 *   `new URL()` cannot parse propagates to the caller.
 */
export async function getMetadataFromUrl(url: string): Promise<Record<string, string>> {
  // The headers imitate a desktop Chrome navigation coming from a Google
  // result page, presumably so sites that reject non-browser clients still
  // answer.
  // NOTE(review): the User-Agent advertises Chrome 121 while `sec-ch-ua`
  // advertises version 91 — confirm whether that mismatch is intentional.
  const response = await fetch(url, {
    method: "GET",
    headers: new Headers({
      "User-Agent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
      "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
      "Accept-Language": "en-US,en;q=0.5",
      "Sec-Fetch-Site": "cross-site",
      "Sec-Fetch-Mode": "navigate",
      "Sec-Fetch-User": "?1",
      "Sec-Fetch-Dest": "document",
      "Referer": "https://www.google.com/",
      "sec-ch-ua": `"Not A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"`,
      "sec-ch-ua-mobile": "?0",
      "sec-ch-ua-platform": `"macOS"`,
      "Upgrade-Insecure-Requests": "1",
    }),
  });
  const html = await response.text();

  // JSDOM has already parsed the markup; use its document directly instead of
  // parsing the same string a second time through DOMParser.
  const doc = new JSDOM(html).window.document;

  const metadata: Record<string, string> = {};
  metadata.title = doc.getElementsByTagName("title")[0]?.textContent || "";

  for (const metaTag of doc.getElementsByTagName("meta")) {
    // Standard tags use `name`; Open Graph style tags use `property`.
    const name = metaTag.getAttribute("name") || metaTag.getAttribute("property");
    const content = metaTag.getAttribute("content");
    if (name && content) {
      metadata[name] = content;
    }
  }

  // Work on the parsed pathname rather than the raw string so a query string
  // or fragment after a trailing slash ("/post/?utm=1") cannot blank the slug.
  const segments = new URL(url).pathname.split("/").filter((segment) => segment !== "");
  metadata.slug = segments.pop() ?? "";

  return metadata;
}
/**
 * Deletes every element matching `removeSelectors` from an HTML string.
 *
 * NOTE(review): `comment` in the default list is an element selector, so it
 * presumably does not strip `<!-- ... -->` comment nodes — confirm intent.
 *
 * @param html Markup to clean.
 * @param removeSelectors Selector list naming the elements to drop.
 * @returns The document re-serialized by cheerio once the matches are removed.
 */
export function removeSelectorsFromHtml(html, removeSelectors = "style, script, link, noscript, frame, iframe, comment") {
  const doc = cheerio.load(html);
  const unwanted = doc(removeSelectors);
  unwanted.remove();
  return doc.html();
}
/**
 * Extracts the visible text of every element matching `selector` as a single
 * line.
 *
 * Elements matching `removeSelectors` are removed from the document first, so
 * their text never reaches the result. The trimmed text of each match is
 * joined with a space, then every run of whitespace is collapsed to one space.
 *
 * @param html Markup to search.
 * @param selector Selector for the elements whose text is wanted.
 * @param removeSelectors Selector list of elements to discard beforehand.
 * @returns The combined text, or `""` when nothing matches.
 */
export function selectHtml(html, selector = "h1", removeSelectors = "style, script, link, noscript, frame, iframe, comment") {
  const $ = cheerio.load(html);
  $(removeSelectors).remove();

  const pieces: string[] = [];
  $(selector).each((_index, node) => {
    pieces.push($(node).text().trim());
  });

  const joined = pieces.join(" ");
  return joined.replace(/\s+/g, ' ').trim();
}
export async function convertHtml(html, option = "semantic") {
switch (option) {
case "markdown":
return convertHtmlToSemanticMarkdown(html);