import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=6";
import { load } from "npm:cheerio";
import { format } from "npm:date-fns";
// Today's date rendered with the date-fns pattern "MMMM d"; the single space
// between the two parts is then swapped for an underscore to form the path
// segment of the English Wikipedia article for the current day.
// NOTE(review): this runs once when the module is evaluated and uses the
// runtime's local clock — a long-lived instance could keep serving a stale day;
// confirm against the hosting model.
const monthAndDay = format(new Date(), "MMMM d");
const todayFormatted = monthAndDay.replace(" ", "_");
const onThisDayUrl = "https://en.wikipedia.org/wiki/" + todayFormatted;
console.log({ todayFormatted, onThisDayUrl });
/**
 * Heuristically extracts readable text from a parsed page.
 *
 * Takes the text of the first `#mw-content-text` element, keeps only what
 * precedes the first literal occurrence of ".mw-parser-output" in that text,
 * then drops the first and last line of that chunk and trims the result.
 *
 * @param $ Selector function for the loaded document (a cheerio instance fits
 *   this shape); only `$(selector).first().text()` is used.
 * @returns The extracted text, or an empty string when the chunk has two
 *   lines or fewer (including when the text starts with the marker).
 */
const genericBodyParse = (
  $: (selector: string) => { first(): { text(): string } },
): string => {
  const body = $("#mw-content-text").first().text();
  console.log("body", body);
  // `split` always yields at least one element and element 0 is always a
  // string, so the leading chunk is the only one that can ever be selected.
  const [leadingChunk] = body.split(".mw-parser-output");
  console.log("parsedBody1", leadingChunk);
  // Drop the first and last line of the chunk, then trim surrounding whitespace.
  const parsedBody = leadingChunk.split("\n").slice(1, -1).join("\n").trim();
  console.log("parsedBody2", parsedBody);
  return parsedBody;
};
/**
 * Downloads `url`, loads the markup with cheerio and returns its text.
 *
 * When `url` is the module's "on this day" URL, the text of every
 * `.mw-content-ltr > ul` match is used, capped at 9,000 characters.
 * Any other URL is delegated to `genericBodyParse`.
 * The URL and the resulting text are logged before returning.
 */
const fetchAndParsePage = async (url: string) => {
  const page = load(await fetchText(url));
  const parsedBody = url === onThisDayUrl
    ? page(".mw-content-ltr > ul").text().slice(0, 9_000)
    : genericBodyParse(page);
  console.log({ url, parsedBody });
  return parsedBody;
};
/** Values accepted by the `format` query parameter. */
const allowedFormats = ["html", "json", "text"];

/**
 * Escapes the characters that are significant in HTML so that scraped text is
 * rendered literally instead of being interpreted as markup.
 */
const escapeHtml = (text: string): string =>
  text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");

/** Returns true when `candidate` parses as an absolute http(s) URL. */
const isHttpUrl = (candidate: string): boolean => {
  try {
    const { protocol } = new URL(candidate);
    return protocol === "http:" || protocol === "https:";
  } catch {
    return false;
  }
};

/**
 * HTTP handler that returns the text scraped from a page.
 *
 * Query parameters:
 * - `url`: page to fetch; defaults to the module's "on this day" URL.
 * - `format`: one of `html` (default), `json` or `text`.
 *
 * Responses:
 * - 400 (plain text) when `format` is not supported or `url` is not an
 *   absolute http(s) URL.
 * - `json`: `{ data: string[] }`, the text split on newlines.
 * - `html`: the HTML-escaped text wrapped in a `<pre>` element.
 * - `text`: the text as-is.
 *
 * NOTE(review): `url` is caller-controlled and is fetched server-side; if this
 * endpoint is public, consider restricting it to an allow-list of hosts.
 *
 * @throws Whatever `fetchAndParsePage` throws (e.g. on network failure).
 */
export const wikipediaToday = async (req: Request): Promise<Response> => {
  const searchParams = new URL(req.url).searchParams;
  const url = searchParams.get("url") ?? onThisDayUrl;
  // Named `outputFormat` so it does not shadow the imported date-fns `format`.
  const outputFormat = searchParams.get("format") ?? "html";
  if (!allowedFormats.includes(outputFormat)) {
    return new Response(
      `unsupported format '${outputFormat}'. allowed formats: ${allowedFormats.join(", ")}`,
      { status: 400 },
    );
  }
  // Reject malformed URLs and non-http(s) schemes up front with a client error
  // instead of letting the fetch fail (or read something unintended) later.
  if (!isHttpUrl(url)) {
    return new Response(
      `unsupported url '${url}'. expected an absolute http(s) URL`,
      { status: 400 },
    );
  }
  const data: string = await fetchAndParsePage(url);
  switch (outputFormat) {
    case "json":
      return Response.json({ data: data.split("\n") });
    case "html":
      // The scraped text is untrusted: escape it so it cannot inject markup.
      return new Response(
        `<!DOCTYPE html><html lang="en"><body><pre>${escapeHtml(data)}</pre></body></html>`,
        {
          // Explicit charset so non-ASCII text is decoded correctly.
          headers: { "Content-Type": "text/html; charset=utf-8" },
        },
      );
    case "text":
      return new Response(data, {
        headers: { "Content-Type": "text/plain; charset=utf-8" },
      });
    default:
      throw new Error("unreachable");
  }
};