nice
This commit is contained in:
177
raypeat-articles/process-and-cleanup.ts
Normal file
177
raypeat-articles/process-and-cleanup.ts
Normal file
@@ -0,0 +1,177 @@
|
||||
import fs from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import articles from "./articles";
|
||||
import jsdom from "jsdom";
|
||||
|
||||
const DUMPS_LOCATION = "article-dumps";
|
||||
const PROCESSED_LOCATION = "processed-articles";
|
||||
|
||||
/**
 * Downloads every entry of `articles`, extracts the inner HTML of the first
 * element with class `entries`, and writes it to
 * `<DUMPS_LOCATION>/<slug>.html`.
 *
 * Failures are handled per article: a failed download is logged with the
 * `e:` prefix, a failed parse or write with the `d:` prefix, and neither
 * stops the remaining articles from being processed.
 *
 * @returns A promise that resolves once every dump file has been written
 *   (or its failure has been logged).
 */
export async function scrapeArticlesMainContent(): Promise<void> {
  const dumpsDir = path.resolve(".", DUMPS_LOCATION);
  // Without the target directory every single write below would fail.
  await fs.mkdir(dumpsDir, { recursive: true });

  // Download everything concurrently. `allSettled` keeps the result order
  // aligned with `articles`, so index i below refers to the same article.
  const results = await Promise.allSettled(
    articles.map(async (article) => {
      try {
        const response = await fetch(article.url);
        return await response.text();
      } catch (e) {
        console.log("e:", article.url, e);
        throw new Error(`error occurred with this one: ${e}`);
      }
    }),
  );

  for (const [i, result] of results.entries()) {
    if (result.status === "rejected") {
      // Already logged when the download failed.
      continue;
    }
    const article = articles[i];
    try {
      const dom = new jsdom.JSDOM(result.value);
      const mainContent =
        dom.window.document.body.getElementsByClassName("entries").item(0)?.innerHTML ?? "";
      // Awaited so that a failed write is caught right here instead of
      // surfacing later as an unhandled promise rejection.
      await fs.writeFile(path.resolve(dumpsDir, article.slug + ".html"), mainContent);
    } catch (e) {
      console.log("d:", article.url, e);
    }
  }
}
|
||||
|
||||
/**
 * Removes every comment node found anywhere beneath `node`.
 *
 * @param node - Root of the subtree to strip. The root itself is never removed.
 */
function deleteChildCommentsRecursive(node: Node): void {
  // Iterate over a snapshot: `childNodes` is a live list, so removing a child
  // while iterating it directly makes the next sibling slide into the current
  // index and get skipped.
  for (const child of Array.from(node.childNodes)) {
    // comment node type === 8 === Node.COMMENT_NODE but not available here
    if (child.nodeType === 8) {
      child.remove();
    } else {
      deleteChildCommentsRecursive(child);
    }
  }
}
|
||||
|
||||
/**
 * If `node`'s HTML contains the marker text "A R T I C L E", promotes it to
 * the document heading: the node is replaced by an `<h1>` holding its HTML
 * with the first occurrence of the marker removed, and the document's
 * `<title>` is set to the same content, trimmed.
 *
 * @param doc - Document that owns `node`; used to create elements and to
 *   reach `<head>`.
 * @param node - Candidate node; treated as an element.
 * @returns `true` when the node was replaced, `false` when it was left alone.
 */
function setAsTitleIfContainsArticle(doc: Document, node: Node): boolean {
  const el = node as HTMLElement;
  // `innerHTML` is absent on non-element nodes, hence the optional chain.
  if (!el.innerHTML?.includes("A R T I C L E")) {
    return false;
  }

  const headingHtml = el.innerHTML.replace("A R T I C L E", "");
  const heading = doc.createElement("h1");
  heading.innerHTML = headingHtml;
  el.replaceWith(heading);

  // Reuse an existing <title> instead of appending a second one, so repeated
  // calls on the same document never produce duplicate titles.
  let title = doc.head.querySelector("title");
  if (!title) {
    title = doc.createElement("title");
    doc.head.appendChild(title);
  }
  title.innerHTML = headingHtml.trim();
  return true;
}
|
||||
|
||||
/**
 * Describes how an element matched by a selector in `selectorReplacementMap`
 * is rewritten by `cleanupFile`.
 */
type ReplcementEntry = {
  /**
   * Tag name of the element that takes the matched node's place. When null
   * or omitted, the children are moved into a document fragment instead,
   * i.e. the wrapper is dropped and its content kept.
   */
  tag?: string | null;
  /** Properties copied onto the replacement node with `Object.assign`. */
  attrs?: Partial<Record<string, unknown>>;
  /**
   * Optional hook run before the default replacement. Returning `true`
   * signals that the hook handled the node and the default replacement
   * must be skipped.
   */
  extra?: (doc: Document, node: Node) => boolean;
};
|
||||
|
||||
/**
 * Rewrite rules applied by `cleanupFile`, keyed by CSS selector and processed
 * in declaration order.
 *
 * - `null`: the matched element is removed together with its content.
 * - `{ tag: null }`: the element is unwrapped (children kept, wrapper dropped).
 * - `{ tag: "x" }`: the element is replaced by a fresh `<x>` holding its children.
 */
const selectorReplacementMap: Record<string, ReplcementEntry | null> = {
  // Removed outright.
  title: null,

  // Unwrapped or renamed.
  ul: { tag: null },
  i: { tag: "em" },
  font: { tag: null },
  p: { tag: "p" },
  div: { tag: null },

  // Removed outright.
  img: null,
  br: null,
  wbr: null,

  // Bold text becomes <strong>, unless the hook turns it into the heading.
  b: { tag: "strong", extra: setAsTitleIfContainsArticle },

  center: null,
  hr: { tag: "hr" },
  table: null,

  // Class-specific spans.
  "span.title": {
    tag: "header",
    attrs: { className: "title" },
    extra: setAsTitleIfContainsArticle,
  },
  "span.posted": {
    tag: "article",
    attrs: { className: "posted" },
  },
} as const;
|
||||
|
||||
/**
 * Walks the subtree beneath `root` depth-first and invokes `cb` for every
 * text node it finds.
 *
 * @param doc - Passed through unchanged as the first argument of `cb`.
 * @param root - Node whose descendants are visited; `root` itself is not.
 * @param cb - Called with `doc` and each text node. It may remove or replace
 *   the node it receives.
 */
function forEachTextNode(doc: Document, root: Node, cb: (doc: Document, node: Text) => void): void {
  // Iterate over a snapshot: `childNodes` is live, so a callback that removes
  // its node would otherwise cause the following sibling to be skipped.
  for (const child of Array.from(root.childNodes)) {
    // text node type === 3 === Node.TEXT_NODE but not available here
    if (child.nodeType === 3) {
      cb(doc, child as Text);
    } else {
      forEachTextNode(doc, child, cb);
    }
  }
}
|
||||
|
||||
/**
 * Reads one dumped article from `DUMPS_LOCATION`, normalises its markup and
 * writes the result under the same file name to `PROCESSED_LOCATION`.
 *
 * Steps, in order:
 * 1. apply every rule of `selectorReplacementMap`;
 * 2. turn text nodes consisting only of `=` characters into `<hr>` and text
 *    nodes containing "REFERENCES" into an `<h3>`;
 * 3. strip all comment nodes;
 * 4. serialise the document and write it out.
 *
 * @param fileName - Name of the file inside `DUMPS_LOCATION` (not a path).
 * @returns A promise that settles once the processed file has been written.
 */
async function cleanupFile(fileName: string): Promise<void> {
  const filePath = path.resolve(".", DUMPS_LOCATION, fileName);
  const { window } = new jsdom.JSDOM(await fs.readFile(filePath, "utf8"));
  const document = window.document;

  for (const [selector, replacement] of Object.entries(selectorReplacementMap)) {
    for (const node of document.querySelectorAll(selector)) {
      if (!replacement) {
        // A null rule means the element and its content are dropped.
        node.remove();
        continue;
      }
      if (replacement.extra?.(document, node)) {
        // The hook already dealt with this node.
        continue;
      }
      // Without a tag the children land in a fragment, which unwraps them
      // into the parent when inserted.
      const newNode = replacement.tag
        ? document.createElement(replacement.tag)
        : document.createDocumentFragment();
      newNode.replaceChildren(...node.childNodes);
      Object.assign(newNode, replacement.attrs ?? {});
      node.replaceWith(newNode);
    }
  }

  forEachTextNode(document, document.documentElement, (doc: Document, node: Text) => {
    const text = node.textContent ?? "";
    // Anchored on purpose: only a text node made up solely of "=" (plus
    // surrounding whitespace) is a separator. An unanchored pattern would
    // wipe out any sentence that merely contains an equals sign.
    if (/^\s*=+\s*$/.test(text)) {
      node.replaceWith(doc.createElement("hr"));
    } else if (text.includes("REFERENCES")) {
      const heading = doc.createElement("h3");
      // textContent, not innerHTML: the source is plain text and must not be
      // re-parsed as markup.
      heading.textContent = text.trim();
      node.replaceWith(heading);
    }
  });

  deleteChildCommentsRecursive(document.documentElement);

  // NOTE(review): the `<2F>` pattern looks like an escaped rendering of a
  // mis-encoded character rather than a literal string — confirm against the
  // dumped files.
  const html = document.documentElement.outerHTML.replaceAll(/<2F>/g, '"');
  // Awaited so callers really wait for the file and see write failures.
  await fs.writeFile(path.resolve(".", PROCESSED_LOCATION, fileName), html);
}
|
||||
|
||||
/**
 * Runs `cleanupFile` on every file found in `DUMPS_LOCATION`, concurrently.
 *
 * A failure on one file does not stop the others; each failure is reported
 * on stderr together with the file name.
 *
 * @returns A promise that resolves once every file has been attempted.
 *   It rejects only if the dump directory cannot be listed or the output
 *   directory cannot be created.
 */
async function cleanup(): Promise<void> {
  const fileNames = await fs.readdir(path.resolve(".", DUMPS_LOCATION));
  // Make sure the destination exists before any file is written to it.
  await fs.mkdir(path.resolve(".", PROCESSED_LOCATION), { recursive: true });

  const results = await Promise.allSettled(fileNames.map((fileName) => cleanupFile(fileName)));

  results.forEach((result, i) => {
    if (result.status === "rejected") {
      console.error("cleanup failed:", fileNames[i], result.reason);
    }
  });
}
|
||||
|
||||
// Run the cleanup pass as soon as this module is loaded. The rejection
// handler keeps a failure (e.g. an unreadable dump directory) from becoming
// an unhandled promise rejection and marks the process as failed.
cleanup().catch((error: unknown) => {
  console.error("cleanup failed:", error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user