jamiedubs-wikipediatoday.web.val.run
Readme

Fetch the contents of the Wikipedia "On this day in history" page. Defaults to HTML output; specify ?format=json or ?format=text for other outputs, e.g.

#wikipedia

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import { fetchText } from "https://esm.town/v/stevekrouse/fetchText?v=6";
import { load } from "npm:cheerio";
import { format } from "npm:date-fns";
// Target page for "on this day". As of July 20, 2024 Wikipedia switched to date-named
// URLs such as "July_20" (and "Selected_anniversaries/July_20"); the address used before
// that was https://en.wikipedia.org/wiki/Wikipedia:On_this_day/Today.
// Both values are computed once, when this module is loaded.
const todayFormatted = format(new Date(), "MMMM d").replace(/ /, "_");
// Alternative URL scheme, currently unused:
// `https://en.wikipedia.org/wiki/Wikipedia:Selected_anniversaries/${todayFormatted}`
const onThisDayUrl = `https://en.wikipedia.org/wiki/${todayFormatted}`;
console.log({ todayFormatted, onThisDayUrl });
// Doing some homework on other interesting pages...
//
// Homepage sections:
// https://wikipedia.org
// - Today's Featured Article (TFA): https://en.wikipedia.org/w/index.php?title=Wikipedia:TFA
// - Did you know...
// - In The News: https://en.wikipedia.org/wiki/Portal:Current_events
// - On This Day in History: https://en.wikipedia.org/wiki/Wikipedia:On_this_day/Today
//
// Special pages:
// https://en.wikipedia.org/wiki/Special:SpecialPages
// - Recent additions: https://en.wikipedia.org/wiki/Wikipedia:Recent_additions
// - Random article: https://en.wikipedia.org/wiki/Special:Random
// - Random article in category (e.g. Internet memes): https://en.wikipedia.org/wiki/Special:RandomInCategory?wpcategory=Internet+memes
// - Search: TODO
//
// More ideas...
// Content portals: https://en.wikipedia.org/wiki/Wikipedia:Contents/Portals
//
/**
 * Rough, best-effort text extraction for an arbitrary Wikipedia page.
 *
 * Takes the text of the first `#mw-content-text` match, keeps only the part
 * that precedes the first literal ".mw-parser-output" occurrence, drops the
 * first and last lines of that part, and trims what is left. Intermediate
 * values are logged to the console.
 *
 * @param $ - page handle; it is called as `$(selector)` and the result must
 *   support `.first().text()`.
 * @returns the trimmed text, or "" when nothing remains.
 */
const genericBodyParse = ($) => {
  const body = $("#mw-content-text").first().text();
  console.log("body", body);

  // split() always returns at least one element, so the leading segment is
  // the one that ends up being used (it may be an empty string).
  const [leadingSegment] = body.split(".mw-parser-output");
  console.log("parsedBody1", leadingSegment);

  // Discard the first and the last line, then re-join and trim.
  const innerLines = leadingSegment.split("\n").slice(1, -1);
  const cleaned = innerLines.join("\n").trim();
  console.log("parsedBody2", cleaned);

  return cleaned;
};
/**
 * Downloads `url` and reduces the page to plain text.
 *
 * - For the "on this day" URL: the combined text of every `<ul>` that is a
 *   direct child of `.mw-content-ltr`, cut off after 9,000 characters.
 * - For any other URL: whatever `genericBodyParse` extracts.
 *
 * The URL and the resulting text are logged to the console.
 *
 * @param url - address of the page to fetch.
 * @returns the extracted text.
 */
const fetchAndParsePage = async (url: string) => {
  const $ = load(await fetchText(url));

  // The <ul> selector pulls in all three major sections (Events, Births, Deaths)
  // although only Events is really wanted.
  // FIXME: the intended selector is
  //   $(".mw-heading").first().nextUntil(".mw-heading", "ul").text()
  // but it isn't quite working; until then the 9_000-character cut is a hack
  // to keep the output manageable.
  const pageText = url === onThisDayUrl
    ? $(".mw-content-ltr > ul").text().slice(0, 9_000)
    // Rough default Wikipedia page parsing; results are pretty inconsistent.
    : genericBodyParse($);

  console.log({ url, parsedBody: pageText });
  return pageText;
};
/** Output formats accepted through the `format` query parameter. */
const allowedFormats = ["html", "json", "text"];

/**
 * Replaces the characters that are significant in HTML markup with entity
 * references so arbitrary text can be embedded in a page without being
 * interpreted as markup.
 */
const escapeHtml = (raw: string): string =>
  raw
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");

/**
 * HTTP handler: fetches a Wikipedia page and returns its extracted text.
 *
 * Query parameters:
 * - `url`    - page to fetch; defaults to today's "on this day" page.
 * - `format` - one of "html" (default), "json" or "text".
 *
 * Responses:
 * - "json": `{ data: string[] }`, the extracted text split into lines.
 * - "html": a minimal document with the (HTML-escaped) text inside `<pre>`.
 * - "text": the extracted text as-is.
 * - any other `format`: status 400 with a plain-text explanation.
 *
 * @param req - incoming request; only its URL is inspected.
 * @returns the response described above.
 */
export const wikipediaToday = async (req: Request): Promise<Response> => {
  const searchParams = new URL(req.url).searchParams;
  // NOTE(review): `url` is caller-controlled and is fetched server-side as given.
  // Consider restricting it to Wikipedia hosts if this endpoint is publicly reachable.
  const url = searchParams.get("url") ?? onThisDayUrl;
  // Named `outputFormat` so it does not shadow the date-fns `format` import.
  const outputFormat = searchParams.get("format") ?? "html";

  if (!allowedFormats.includes(outputFormat)) {
    return new Response(
      `unsupported format '${outputFormat}'. allowed formats: ${allowedFormats.join(", ")}`,
      { status: 400 },
    );
  }

  const data = await fetchAndParsePage(url);

  switch (outputFormat) {
    case "json":
      return Response.json({ data: data.split("\n") });
    case "html":
      // The text comes from a remote page, so it must be escaped before being
      // placed inside markup; the charset is declared because the body is
      // UTF-8 and regularly contains non-ASCII characters.
      return new Response(
        `<!DOCTYPE html><html lang="en"><body><pre>${escapeHtml(data)}</pre></body></html>`,
        { headers: { "Content-Type": "text/html; charset=utf-8" } },
      );
    case "text":
      return new Response(data, {
        headers: { "Content-Type": "text/plain; charset=utf-8" },
      });
    default:
      // Guard in case `allowedFormats` gains an entry without a matching case.
      throw new Error("unreachable");
  }
};
Val Town is a social website to write and deploy JavaScript.
Build APIs and schedule functions from your browser.
Comments
4
andrewn avatar
andrewn(Edited )

Why does the wikipedia link show a page for July 20th (it's July 25th)? Come on wikipedia! :-)

jamiedubs avatar

@andrewn huh! Looks like they might've renamed it to "Selected Anniversaries" and July 20th was the last entry?! very annoying

jamiedubs avatar

Looks like they've switched the Main_Page to transclude from a URL like: {{Wikipedia:Selected anniversaries/{{#time:F j}}}}

jamiedubs avatar

sorta fixed now. including too much content! All events, births & deaths on that date. I only want Events but having trouble getting Cheerio to behave and ran out of time :)

August 14, 2024