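// Analyser script of the 250kb-club (https://github.com/nkoehring/250kb-club).
// It reads a list of page URLs, runs YLT (Yellow Lab Tools) analyses in small
// batches and creates, updates or removes the page records depending on each
// page's measured weight.
// Optional CLI arguments: pages file (default ./pages.txt) and output
// directory (default ./content).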
import "./index.d.ts";
|
|
import {
|
|
url2title,
|
|
getPageRecord,
|
|
writeRecord,
|
|
removeRecord,
|
|
} from "./analyser/toolkit.ts";
|
|
import {
|
|
requestMetricsRun,
|
|
checkStatus,
|
|
retrieveMetrics,
|
|
} from "./analyser/metrics.ts";
|
|
|
|
const INPUT_FILE = Deno.args[0] ?? "./pages.txt";
const OUTPUT_PATH = Deno.args[1] ?? "./content"; // results are written here
const RECHECK_THRESHOLD = 60 * 60 * 24 * 7 * 1000; // recheck pages older than 1 week
const REJECT_THRESHOLD = 262144; // 256KB (duh)
const PARALLEL_JOBS = 3; // max YLT jobs

const now = Date.now();
const pages = await getPageList(); // all pages
const pagesUpdating: string[] = []; // currently running ylt jobs

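// reads the input file and keeps only the lines that look like URLs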
async function getPageList(): Promise<string[]> {
  const inputContent = await Deno.readTextFile(INPUT_FILE);
  return inputContent.split("\n").filter((line) => line.startsWith("http"));
}

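// Fetches the finished metrics for `runId` and writes or updates the record
// for `url`; pages heavier than REJECT_THRESHOLD are rejected and a previously
// existing record is removed.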
async function updateRecord(runId: string, url: string): Promise<boolean> {
  const oldRecord = await getPageRecord(url, OUTPUT_PATH);
  const metrics = await retrieveMetrics(runId);

  if (!metrics) {
    console.error("failed to retrieve results for", url, runId);
    return false;
  }

  // poor man's toISODateString
  const now = new Date().toISOString().split("T")[0];

  const weight = metrics.metrics.contentLength;
  // how much of the total weight is HTML, in percent
  const ratio = Math.round((metrics.metrics.htmlSize / weight) * 100);

  if (weight > REJECT_THRESHOLD) {
    console.log(url, "rejected! Weighs", Math.round(weight / 1024), "kb");
    if (oldRecord) {
      removeRecord(url, OUTPUT_PATH).catch(() => {
        console.error("Failed to remove old record of rejected url", url);
      });
    }
    return false;
  }

  const record: PageRecord = {
    title: url2title(url),
    date: oldRecord === null ? now : oldRecord.date,
    updated: now,
    weight,
    extra: {
      source: url,
      ratio,
      size: Math.round(weight / 1024),
    },
  };

  const success = await writeRecord(record, url, OUTPUT_PATH);

  if (success) {
    console.log(url, "successfully updated");
  } else {
    console.error(url, "record could not be written!");
  }

  // the function is declared to return a boolean, so report the write result
  return success;
}

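// Decides whether `url` needs a fresh analysis. Returns true if the stored
// record is still recent enough, false if requesting a YLT run failed, and the
// run id of the newly started analysis otherwise.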
async function checkPage(url: string) {
  const record = await getPageRecord(url, OUTPUT_PATH);
  const lastUpdated = Date.parse(record?.updated || "");
  const needsCheck = !record || now - lastUpdated > RECHECK_THRESHOLD;

  if (!needsCheck) {
    console.log(url, "is up-to-date");
    return true;
  }

  const runId = await requestMetricsRun(url);
  if (!runId) {
    console.error(url, "updating failed!");
    return false;
  }

  console.log(url, "new or outdated, runId is", runId);
  return runId;
}

function sleep(duration: number): Promise<void> {
  return new Promise((resolve) => {
    setTimeout(() => resolve(), duration);
  });
}

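// Takes up to PARALLEL_JOBS pages off the list, starts their checks, polls YLT
// until every started run has either failed or completed, and then recurses
// into the next batch until the page list is empty.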
async function handleBatch() {
  if (!pages.length) return; // done, yeah!

  const batch = pages.splice(0, PARALLEL_JOBS);
  const jobs = batch.map((url) => checkPage(url));

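  // the queue holds the pending checkPage() promises and, later on, the run
  // ids of analyses that are still in progress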
  while (jobs.length) {
    // take the first job and check its status;
    // if the analysis is not finished yet, it gets added back to the end of the list
    const runId = await jobs.shift();

    // page is up-to-date or YLT has an error
    if (runId === true || runId === false) continue;

    // TODO: handle failures more gracefully
    const { url, status } = await checkStatus(runId);

    if (status === "failed") {
      console.error(url, "YLT analysis failed");
      continue;
    } else if (status === "complete") {
      console.log(url, "updating record...");
      await updateRecord(runId, url);
      continue;
    } else {
      // not done yet, add it back
      jobs.push(runId);
      // wait a bit before checking again
      await sleep(1000);
    }
  }

  handleBatch();
}

handleBatch();