178 lines
5.3 KiB
TypeScript
178 lines
5.3 KiB
TypeScript
import fs from "node:fs/promises";
|
||
import path from "node:path";
|
||
import articles from "./articles";
|
||
import jsdom from "jsdom";
|
||
|
||
// Directory (resolved against ".") that holds the raw scraped article HTML:
// written by scrapeArticlesMainContent and read back by cleanupFile.
const DUMPS_LOCATION = "article-dumps";
|
||
// Directory (resolved against ".") that cleanupFile writes the cleaned-up HTML into.
const PROCESSED_LOCATION = "processed-articles";
|
||
|
||
/**
 * Downloads every entry of `articles` and stores the inner HTML of the first
 * element with class "entries" as `<slug>.html` inside DUMPS_LOCATION.
 *
 * Articles are processed concurrently and independently: a failed download is
 * logged with the "e:" prefix, a failed parse/write with the "d:" prefix, and
 * neither aborts the remaining articles. An article without an "entries"
 * element produces an empty file.
 *
 * @returns A promise that settles only after every file write has finished.
 */
export async function scrapeArticlesMainContent(): Promise<void> {
  const dumpDir = path.resolve(".", DUMPS_LOCATION);

  const tasks = articles.map(async (article) => {
    let html: string;
    try {
      html = await (await fetch(article.url)).text();
    } catch (e: unknown) {
      console.log("e:", article.url, e);
      return;
    }

    try {
      const dom = new jsdom.JSDOM(html);
      const entries = dom.window.document.body.getElementsByClassName("entries").item(0);
      // Recursive mkdir is a no-op when the directory already exists.
      await fs.mkdir(dumpDir, { recursive: true });
      // Awaiting the write keeps failures inside this try/catch instead of
      // surfacing later as unhandled promise rejections.
      await fs.writeFile(path.resolve(dumpDir, article.slug + ".html"), entries?.innerHTML ?? "");
    } catch (e: unknown) {
      console.log("d:", article.url, e);
    }
  });

  // Each task handles its own errors; allSettled guards against anything unexpected.
  await Promise.allSettled(tasks);
}
|
||
|
||
/**
 * Removes every comment node found anywhere below `node`.
 *
 * @param node Root of the subtree to strip; the root itself is never removed.
 */
function deleteChildCommentsRecursive(node: Node): void {
  // Numeric value of Node.COMMENT_NODE; the global `Node` constructor is not
  // available in this runtime, so the constant is spelled out.
  const COMMENT_NODE = 8;

  // `childNodes` is a live list: removing a child while iterating it directly
  // shifts the remaining entries and skips the next sibling. Iterate a snapshot.
  for (const child of Array.from(node.childNodes)) {
    if (child.nodeType === COMMENT_NODE) {
      child.remove();
    } else {
      deleteChildCommentsRecursive(child);
    }
  }
}
|
||
|
||
/**
 * If `node`'s markup contains the "A R T I C L E" marker, strips the first
 * occurrence of the marker, replaces the node with an `<h1>` holding the
 * remaining markup, and stores the trimmed markup as the document title.
 *
 * The document keeps a single `<title>`: an existing one in `<head>` is
 * updated in place, otherwise a new one is created and appended.
 *
 * @param doc  Document that owns `node`.
 * @param node Candidate node; treated as an element with `innerHTML`.
 * @returns `true` when the node was consumed (the caller should not process it
 *          further), `false` when the marker was absent and nothing changed.
 */
function setAsTitleIfContainsArticle(doc: Document, node: Node): boolean {
  const MARKER = "A R T I C L E";
  const el = node as HTMLSpanElement;

  if (!el.innerHTML?.includes(MARKER)) {
    return false;
  }

  el.innerHTML = el.innerHTML.replace(MARKER, "");
  // Read the markup back from the element so the heading and title use the
  // serialization the DOM actually holds after the assignment.
  const headingHtml = el.innerHTML;
  el.replaceWith(Object.assign(doc.createElement("h1"), { innerHTML: headingHtml }));

  const existingTitle = doc.head.querySelector("title");
  const title = existingTitle ?? doc.createElement("title");
  title.innerHTML = headingHtml.trim();
  if (!existingTitle) {
    doc.head.appendChild(title);
  }
  return true;
}
|
||
|
||
/**
 * Describes what to do with the nodes matched by one selector.
 *
 * - `tag`: tag name of the element that replaces the node; when `null` or
 *   omitted the node is unwrapped (its children take its place).
 * - `attrs`: properties copied onto the replacement node.
 * - `extra`: hook invoked before the replacement; returning `true` means the
 *   hook already handled the node and the default replacement is skipped.
 */
type ReplacementEntry = {
  tag?: string | null;
  attrs?: Record<string, unknown>;
  extra?: (doc: Document, node: Node) => boolean;
};

/**
 * Clean-up rules keyed by CSS selector. A `null` rule removes the matching
 * nodes entirely; otherwise the node is rebuilt as described by the entry.
 * Rules are applied in the order the keys are written here.
 */
const selectorReplacementMap: Record<string, ReplacementEntry | null> = {
  "title": null,
  "ul": {
    tag: null,
  },
  "i": {
    tag: "em",
  },
  "font": {
    tag: null,
  },
  "p": {
    tag: "p",
  },
  "div": {
    tag: null,
  },
  "img": null,
  "br": null,
  "wbr": null,
  "b": {
    tag: "strong",
    extra: setAsTitleIfContainsArticle,
  },
  "center": null,
  "hr": {
    tag: "hr",
  },
  "table": null,
  "span.title": {
    tag: "header",
    attrs: {
      className: "title",
    },
    extra: setAsTitleIfContainsArticle,
  },
  "span.posted": {
    tag: "article",
    attrs: {
      className: "posted",
    },
  },
};
|
||
|
||
/**
 * Invokes `cb` for every text node below `root`, depth-first in document order.
 *
 * The callback may replace or remove the node it receives: each level is
 * iterated from a snapshot of its children, so such mutations do not cause
 * siblings to be skipped.
 *
 * @param doc  Document handed through to the callback unchanged.
 * @param root Subtree root; the root itself is not passed to `cb`.
 * @param cb   Called with `doc` and each text node.
 */
function forEachTextNode(doc: Document, root: Node, cb: (doc: Document, node: Text) => void): void {
  // Numeric value of Node.TEXT_NODE; the global `Node` constructor is not
  // available in this runtime, so the constant is spelled out.
  const TEXT_NODE = 3;

  // `childNodes` is live; snapshot it so callback mutations cannot shift the iteration.
  for (const child of Array.from(root.childNodes)) {
    if (child.nodeType === TEXT_NODE) {
      cb(doc, child as Text);
    } else {
      forEachTextNode(doc, child, cb);
    }
  }
}
|
||
|
||
/**
 * Reads one dumped article from DUMPS_LOCATION, normalizes its markup and
 * writes the result under the same file name into PROCESSED_LOCATION.
 *
 * Steps, in order:
 *  1. apply every rule of `selectorReplacementMap` (remove, unwrap or re-tag
 *     the matching elements),
 *  2. turn text nodes containing `=` into `<hr>` and text nodes containing
 *     "REFERENCES" into an `<h3>`,
 *  3. strip all comment nodes,
 *  4. serialize the document and write it out.
 *
 * @param fileName Name of the file inside DUMPS_LOCATION.
 * @returns A promise that settles once the processed file has been written;
 *          it rejects when reading, directory creation or writing fails.
 */
async function cleanupFile(fileName: string): Promise<void> {
  const sourcePath = path.resolve(".", DUMPS_LOCATION, fileName);
  const { window } = new jsdom.JSDOM(await fs.readFile(sourcePath, "utf8"));
  const document = window.document;

  for (const [selector, replacement] of Object.entries(selectorReplacementMap)) {
    for (const node of document.querySelectorAll(selector)) {
      if (!replacement) {
        node.remove();
        continue;
      }
      // The hook returns true when it has already dealt with the node.
      if (replacement.extra?.(document, node)) {
        continue;
      }
      // Without a tag the children are moved into a fragment, which unwraps the node.
      const newNode = replacement.tag
        ? document.createElement(replacement.tag)
        : document.createDocumentFragment();
      newNode.replaceChildren(...node.childNodes);
      Object.assign(newNode, replacement.attrs ?? {});
      node.replaceWith(newNode);
    }
  }

  forEachTextNode(document, document.documentElement, (doc: Document, node: Text) => {
    const text = node.textContent ?? "";
    // NOTE(review): this pattern is unanchored, so any text node containing a
    // single "=" is replaced by a rule. If only separator lines such as
    // "=====" are meant, it should probably be /^\s*=+\s*$/ — confirm.
    if (/\s*=+\s*/.test(text)) {
      node.replaceWith(doc.createElement("hr"));
    } else if (text.includes("REFERENCES")) {
      // Assign as text so the content is not re-parsed as markup.
      node.replaceWith(Object.assign(doc.createElement("h3"), { textContent: text.trim() }));
    }
  });

  deleteChildCommentsRecursive(document.documentElement);

  const outputPath = path.resolve(".", PROCESSED_LOCATION, fileName);
  // Recursive mkdir is a no-op when the directory already exists.
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  // NOTE(review): the literal `<2F>` below may be a mis-encoded character from
  // the original source — confirm the intended pattern.
  // Awaited so callers observe write failures and completion.
  await fs.writeFile(outputPath, document.documentElement.outerHTML.replace(/<2F>/g, '"'));
}
|
||
|
||
/**
 * Runs `cleanupFile` concurrently over every entry of the dump directory.
 *
 * A failure for one file does not stop the others; each failure is reported
 * on stderr together with the file name.
 *
 * @returns A promise that resolves once every file has been attempted. It
 *          rejects only when the dump directory itself cannot be listed.
 */
async function cleanup(): Promise<void> {
  const fileNames = await fs.readdir(path.resolve(".", DUMPS_LOCATION));
  const results = await Promise.allSettled(fileNames.map((fileName) => cleanupFile(fileName)));

  results.forEach((result, index) => {
    if (result.status === "rejected") {
      console.error("cleanup failed:", fileNames[index], result.reason);
    }
  });
}

// Runs as a side effect of loading this module. The promise is not handled
// here, so a failure to list the dump directory surfaces as an unhandled rejection.
cleanup();
|