This commit is contained in:
Daniel Ledda
2024-10-31 23:46:23 +01:00
parent 314ccaa677
commit bcb820f35e
36 changed files with 4427 additions and 61 deletions

View File

@@ -0,0 +1,177 @@
import fs from "node:fs/promises";
import path from "node:path";
import articles from "./articles";
import jsdom from "jsdom";
// Directory (resolved against the current working directory) that holds the raw scraped HTML fragments.
const DUMPS_LOCATION = "article-dumps";
// Directory (resolved against the current working directory) that receives the cleaned-up HTML files.
const PROCESSED_LOCATION = "processed-articles";
/**
 * Downloads every entry of `articles` and stores the inner HTML of the first element with
 * class `entries` as `<DUMPS_LOCATION>/<slug>.html` (an empty file when no such element exists).
 *
 * Downloads run concurrently. A failed download, parse or write is logged together with the
 * article URL and skipped, so one bad article never prevents the others from being dumped.
 *
 * @returns A promise that resolves once every dump has been written or its failure logged.
 */
export async function scrapeArticlesMainContent(): Promise<void> {
    const downloads = await Promise.allSettled(
        articles.map(async (article) => (await fetch(article.url)).text()),
    );

    await Promise.all(
        downloads.map(async (download, i) => {
            const article = articles[i];
            if (download.status === "rejected") {
                console.log("e:", article.url, download.reason);
                return;
            }
            try {
                const dom = new jsdom.JSDOM(download.value);
                const mainContent =
                    dom.window.document.body.getElementsByClassName("entries").item(0)?.innerHTML ?? "";
                // Recursive mkdir is a no-op when the directory already exists.
                await fs.mkdir(path.resolve(".", DUMPS_LOCATION), { recursive: true });
                // Awaited so that write failures land in the catch below instead of
                // becoming unhandled rejections after this function has returned.
                await fs.writeFile(path.resolve(".", DUMPS_LOCATION, article.slug + ".html"), mainContent);
            } catch (e) {
                console.log("d:", article.url, e);
            }
        }),
    );
}
/**
 * Removes every comment node found anywhere below `node`.
 *
 * @param node Root of the subtree to clean. The root itself is never removed.
 */
function deleteChildCommentsRecursive(node: Node) {
    // `childNodes` is a live list: removing a child while iterating it shifts the remaining
    // entries down and makes the iterator skip the next sibling. Iterate over a snapshot.
    for (const child of Array.from(node.childNodes)) {
        // comment node type === 8 === Node.COMMENT_NODE but not available here
        if (child.nodeType === 8) {
            child.remove();
        } else {
            deleteChildCommentsRecursive(child);
        }
    }
}
/**
 * Promotes an element whose markup contains the "A R T I C L E" marker to the page heading.
 *
 * When the marker is present, the element is replaced by an `<h1>` carrying the same markup
 * minus the first occurrence of the marker, and the document `<title>` is set to that markup,
 * trimmed. An existing `<title>` inside `<head>` is reused; one is created only if none exists.
 *
 * @param doc Document used to create elements and to reach `<head>`.
 * @param node Candidate node; it is treated as an element.
 * @returns `true` when the node was replaced by a heading, `false` when it was left untouched.
 */
function setAsTitleIfContainsArticle(doc: Document, node: Node) {
    const marker = "A R T I C L E";
    const el = node as HTMLSpanElement;
    if (!el.innerHTML?.includes(marker)) {
        return false;
    }

    const headingHtml = el.innerHTML.replace(marker, "");
    el.replaceWith(Object.assign(doc.createElement("h1"), { innerHTML: headingHtml }));

    // Update the existing <title> rather than appending a second one to <head>.
    let title = doc.head.querySelector("title");
    if (!title) {
        title = doc.createElement("title");
        doc.head.appendChild(title);
    }
    title.innerHTML = headingHtml.trim();
    return true;
}
/**
 * Describes how the elements matched by one selector are rewritten during cleanup.
 *
 * NOTE: the name is misspelled ("Replcement"); it is kept because other declarations in this
 * file reference it.
 */
type ReplcementEntry = {
    /** Tag name of the replacement element. When `null` or omitted, the element is unwrapped: only its children are kept. */
    tag?: string | null;
    /** Properties copied onto the replacement node (DOM property names such as `className`). */
    attrs?: Record<string, unknown>;
    /** Optional hook run before the default replacement; returning `true` skips the default replacement. */
    extra?: (doc: Document, node: Node) => boolean;
};
/**
 * Cleanup rules keyed by CSS selector.
 *
 * - `null`: matching elements are removed together with their content.
 * - `{ tag: null }`: matching elements are unwrapped (replaced by their children).
 * - `{ tag: "x" }`: matching elements are replaced by an `<x>` element holding their children,
 *   with `attrs` copied onto it; an `extra` hook that returns `true` takes over instead.
 *
 * The rules are applied by iterating over this object's keys, so the order of the entries
 * below is kept as is.
 */
const selectorReplacementMap: Record<string, ReplcementEntry | null> = {
    title: null,
    ul: { tag: null },
    i: { tag: "em" },
    font: { tag: null },
    p: { tag: "p" },
    div: { tag: null },
    img: null,
    br: null,
    wbr: null,
    b: { tag: "strong", extra: setAsTitleIfContainsArticle },
    center: null,
    hr: { tag: "hr" },
    table: null,
    "span.title": {
        tag: "header",
        attrs: { className: "title" },
        extra: setAsTitleIfContainsArticle,
    },
    "span.posted": {
        tag: "article",
        attrs: { className: "posted" },
    },
} as const;
/**
 * Calls `cb` for every text node below `root`, depth first, in document order.
 *
 * The callback may replace or remove the node it receives: each child list is snapshotted
 * before it is walked, so such edits never cause a sibling to be skipped. Nodes inserted by
 * the callback are not visited.
 *
 * @param doc Document handed through to `cb` unchanged.
 * @param root Node whose descendants are walked; `root` itself is not passed to `cb`.
 * @param cb Invoked with `doc` and each text node found.
 */
function forEachTextNode(doc: Document, root: Node, cb: (doc: Document, node: Text) => void) {
    // Snapshot: `childNodes` is live and `cb` is allowed to mutate the tree.
    for (const child of Array.from(root.childNodes)) {
        // text node type === 3 === Node.TEXT_NODE but not available here
        if (child.nodeType === 3) {
            cb(doc, child as Text);
        } else {
            forEachTextNode(doc, child, cb);
        }
    }
}
/**
 * Cleans one dumped article and writes the result to the processed directory.
 *
 * Steps, in order:
 * 1. Parse `<DUMPS_LOCATION>/<fileName>` with jsdom.
 * 2. Apply every rule of `selectorReplacementMap` (remove, unwrap or re-tag matching elements).
 * 3. Turn text nodes made up only of `=` characters (plus whitespace) into `<hr>`, and text
 *    nodes containing "REFERENCES" into `<h3>` headings.
 * 4. Strip all comment nodes.
 * 5. Write the serialized document to `<PROCESSED_LOCATION>/<fileName>`, creating the directory
 *    if needed.
 *
 * @param fileName Name of the file inside the dumps directory; reused as the output file name.
 * @returns A promise that resolves once the processed file has been written and rejects if
 *          reading, processing or writing fails.
 */
async function cleanupFile(fileName: string): Promise<void> {
    const filePath = path.resolve(".", DUMPS_LOCATION, fileName);
    const { window } = new jsdom.JSDOM((await fs.readFile(filePath)).toString());
    const document = window.document;

    for (const [selector, replacement] of Object.entries(selectorReplacementMap)) {
        // querySelectorAll returns a static list, so replacing nodes while looping is safe.
        for (const node of document.querySelectorAll(selector)) {
            if (!replacement) {
                node.remove();
                continue;
            }
            if (replacement.extra?.(document, node)) {
                continue;
            }
            // Without a tag the children are moved into a fragment, which unwraps the element.
            const newNode = replacement.tag
                ? document.createElement(replacement.tag)
                : document.createDocumentFragment();
            newNode.replaceChildren(...node.childNodes);
            Object.assign(newNode, replacement.attrs ?? {});
            node.replaceWith(newNode);
        }
    }

    // Anchored on purpose: an unanchored pattern matches any text that merely contains "="
    // and would replace whole paragraphs with a rule.
    const separatorLine = /^\s*=+\s*$/;
    forEachTextNode(document, document.documentElement, (doc: Document, node: Text) => {
        const text = node.textContent ?? "";
        if (separatorLine.test(text)) {
            node.replaceWith(doc.createElement("hr"));
        } else if (text.includes("REFERENCES")) {
            // textContent, not innerHTML: the source is plain text and must not be re-parsed as markup.
            node.replaceWith(Object.assign(doc.createElement("h3"), { textContent: text.trim() }));
        }
    });

    deleteChildCommentsRecursive(document.documentElement);

    await fs.mkdir(path.resolve(".", PROCESSED_LOCATION), { recursive: true });
    // NOTE(review): the `<2F>` pattern looks like an encoding artifact (possibly a mangled
    // quote/replacement character in this source file) — confirm the intended character.
    // Awaited so callers only see this file as settled once it is actually on disk.
    await fs.writeFile(
        path.resolve(".", PROCESSED_LOCATION, fileName),
        document.documentElement.outerHTML.replaceAll(/<2F>/g, '"'),
    );
}
/**
 * Runs `cleanupFile` concurrently on every entry of the dumps directory.
 *
 * A failure on one file does not stop the others; each failure is reported on stderr with the
 * name of the offending file.
 *
 * @returns A promise that resolves when every file has been processed or has failed. It
 *          rejects only if the dumps directory itself cannot be listed.
 */
async function cleanup(): Promise<void> {
    const fileNames = await fs.readdir(path.resolve(".", DUMPS_LOCATION));
    const outcomes = await Promise.allSettled(fileNames.map((fileName) => cleanupFile(fileName)));
    outcomes.forEach((outcome, i) => {
        if (outcome.status === "rejected") {
            console.error(`cleanup failed for ${fileNames[i]}:`, outcome.reason);
        }
    });
}
// Module-level side effect: the cleanup pass starts as soon as this module is loaded, including
// when it is imported only for `scrapeArticlesMainContent`. The returned promise is neither
// awaited nor caught here.
cleanup();