瀏覽代碼

Add support for downloading URLs from sitemap.xml

guillermoscript 1 年之前
父節點
當前提交
adfcc5e10e
共有 1 個文件被更改,包括 15 次插入和 3 次删除
  1. 15 3
      src/main.ts

+ 15 - 3
src/main.ts

@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler } from "crawlee";
+import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { config } from "../config.js";
@@ -73,8 +73,20 @@ if (process.env.NO_CRAWL !== "true") {
     // headless: false,
   });
 
-  // Add first URL to the queue and start the crawl.
-  await crawler.run([config.url]);
+  const isUrlASitemap = config.url.endsWith("sitemap.xml");
+
+  if (isUrlASitemap) {
+    const listOfUrls = await downloadListOfUrls({ url: config.url });
+
+    // Add every URL listed in the sitemap to the crawling queue.
+    await crawler.addRequests(listOfUrls);
+
+    // Run the crawler
+    await crawler.run();
+  } else {
+    // Add first URL to the queue and start the crawl.
+    await crawler.run([config.url]);
+  }
 }
 
 const jsonFiles = await glob("storage/datasets/default/*.json", {