// Val Town is a social website to write and deploy JavaScript.
// Build APIs and schedule functions from your browser.
// This program crawls Wikipedia pages starting from an initial URL,
// following links in the infobox and collecting page titles.
// It uses the fetch API to make HTTP requests and cheerio for HTML parsing.
import cheerio from "https://esm.sh/cheerio@1.0.0-rc.12";
// CSS selector for the element whose text crawlWikipedia reads as the page title.
const TITLE_SELECTOR = "#firstHeading > span";
/**
 * Walks a chain of pages by following each page's "Predecessor" infobox link
 * and collects the heading text of every page fetched.
 *
 * Starting at `url`, each page is fetched and parsed; its title is read via
 * `TITLE_SELECTOR`, and the first infobox row labelled "Predecessor" supplies
 * the link to the next page. The walk stops when a page has no such link, when
 * the link points at a different origin than the current page, or when the
 * next page is already in `visited`.
 *
 * @param url - Absolute URL of the first page to fetch.
 * @param visited - URLs that have already been crawled. The set is mutated:
 *   every page fetched here is added to it.
 * @returns Page titles in the order the pages were fetched; an empty array if
 *   `url` is already in `visited`.
 * @throws Error when a page responds with a non-2xx status. Failures raised by
 *   `fetch` or by `new URL` on a malformed link propagate unchanged.
 */
async function crawlWikipedia(url: string, visited: Set<string> = new Set()): Promise<string[]> {
  const titles: string[] = [];
  let currentUrl = url;

  // A loop instead of recursion: same visit order, without one nested promise per page.
  while (!visited.has(currentUrl)) {
    visited.add(currentUrl);

    const response = await fetch(currentUrl);
    if (!response.ok) {
      // Do not scrape an error page as though it were an article.
      throw new Error(`Failed to fetch ${currentUrl}: HTTP ${response.status}`);
    }

    const $ = cheerio.load(await response.text());
    titles.push($(TITLE_SELECTOR).text().trim());

    // Only the first row labelled "Predecessor" is considered, even if it has no link.
    let nextLink: string | undefined;
    for (const row of $('table.infobox.vcard > tbody > tr').toArray()) {
      const $row = $(row);
      if ($row.find('th.infobox-label').text().trim() === 'Predecessor') {
        nextLink = $row.find('td a').attr('href');
        break;
      }
    }
    if (!nextLink) {
      break;
    }

    const nextUrl = new URL(nextLink, currentUrl);
    // The link comes from fetched HTML; never let it steer the crawl to another host.
    if (nextUrl.origin !== new URL(currentUrl).origin) {
      break;
    }
    // Fragments address a section of the same page, so drop them before the visited check.
    nextUrl.hash = '';
    currentUrl = nextUrl.toString();
  }

  return titles;
}
/**
 * HTTP entry point: crawls from the page named by the `url` query parameter
 * and answers with the collected titles as pretty-printed JSON.
 *
 * Responds 400 when `url` is absent or does not begin with
 * `https://en.wikipedia.org/`, and 500 (after logging) when the crawl throws.
 *
 * @param req - Incoming request; only its URL's query string is read.
 * @returns The JSON list of titles, or a plain-text error response.
 */
export default async function server(req: Request): Promise<Response> {
  const { searchParams } = new URL(req.url);
  const startUrl = searchParams.get('url');

  // Guard clause: reject anything that is not an English Wikipedia URL.
  if (startUrl === null || !startUrl.startsWith('https://en.wikipedia.org/')) {
    return new Response("Please provide a valid Wikipedia URL as a 'url' query parameter.", { status: 400 });
  }

  try {
    const titles = await crawlWikipedia(startUrl);
    const body = JSON.stringify(titles, null, 2);
    const headers = { 'Content-Type': 'application/json' };
    return new Response(body, { headers });
  } catch (cause) {
    console.error('Error during crawling:', cause);
    return new Response("An error occurred while crawling Wikipedia.", { status: 500 });
  }
}
// jdan-emperorofjapancrawler.web.val.run
// September 1, 2024