@@ -0,0 +1,58 @@
+// For more information, see https://crawlee.dev/
+import { PlaywrightCrawler } from "crawlee";
+import { readFile, writeFile } from "fs/promises";
+import { glob } from "glob";
+import { config } from "../config";
+
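+// Set NO_CRAWL=true to skip crawling and only rebuild the output from an existing dataset.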
+if (process.env.NO_CRAWL !== "true") {
+ // PlaywrightCrawler crawls the web using a headless
+ // browser controlled by the Playwright library.
+ const crawler = new PlaywrightCrawler({
+ // Use the requestHandler to process each of the crawled pages.
+ async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+ const title = await page.title();
+ log.info(`Title of ${request.loadedUrl} is '${title}'`);
+
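+      // Extract the visible text of the docs container; this callback runs in the browser context.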
+ const html = await page.evaluate(() => {
+ const el = document.querySelector(
+ ".docs-builder-container"
+ ) as HTMLElement | null;
+
+ return el?.innerText || "";
+ });
+
+ // Save results as JSON to ./storage/datasets/default
+ await pushData({ title, url: request.loadedUrl, html });
+
+ // Extract links from the current page
+ // and add them to the crawling queue.
+ await enqueueLinks({
+ globs: [config.glob],
+ });
+ },
+    // Comment out this option to scrape the full website.
+ maxRequestsPerCrawl: config.maxPagesToCrawl,
+ // Uncomment this option to see the browser window.
+ // headless: false,
+ });
+
+ // Add first URL to the queue and start the crawl.
+ await crawler.run([config.url]);
+}
+
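+// Crawlee writes each pushData record to its own JSON file under storage/datasets/default.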
+const jsonFiles = await glob("storage/datasets/default/*.json", {
+ absolute: true,
+});
+
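+// Read every dataset file and collect the parsed records into a single array.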
+const results = [];
+for (const file of jsonFiles) {
+ const data = JSON.parse(await readFile(file, "utf-8"));
+ results.push(data);
+}
+
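+// Write the combined records to the configured output file as pretty-printed JSON.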
+await writeFile(config.outputFileName, JSON.stringify(results, null, 2));