Kaynağa Gözat

Merge pull request #26 from guillermoscript/sitemap-support

Refactor getPageHtml function to handle selector not found case, using body as fallback. Add support for downloading URLs from sitemap.xml. Update comments to let know that sitemap is supported
Steve Sewell 1 yıl önce
ebeveyn
işleme
10a71edde2
4 değiştirilmiş dosya ile 44 ekleme ve 6 silme
  1. BIN
      .DS_Store
  2. 7 1
      README.md
  3. 8 1
      src/config.ts
  4. 29 4
      src/core.ts

BIN
.DS_Store


Dosya farkı çok büyük olduğundan ihmal edildi
+ 7 - 1
README.md


Dosya farkı çok büyük olduğundan ihmal edildi
+ 8 - 1
src/config.ts


+ 29 - 4
src/core.ts

@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler } from "crawlee";
+import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import {Config, configSchema} from "./config.js";
@@ -45,7 +45,7 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
   );
 }
 
-export async function crawl(config: Config) {
+export async function crawl(config: Config) { 
   configSchema.parse(config);
 
   if (process.env.NO_CRAWL !== "true") {
@@ -105,10 +105,35 @@ export async function crawl(config: Config) {
       maxRequestsPerCrawl: config.maxPagesToCrawl,
       // Uncomment this option to see the browser window.
       // headless: false,
+      preNavigationHooks: [
+        // Abort requests for certain resource types
+        async ({ page, log }) => {
+          // If there are no resource exclusions, return
+          const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
+          if (RESOURCE_EXCLUSTIONS.length === 0) {
+            return;
+          }
+          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
+          log.info(`Aborting requests for as this is a resource excluded route`);
+        }
+      ],
     });
 
-    // Add first URL to the queue and start the crawl.
-    await crawler.run([config.url]);
+    const SITEMAP_SUFFIX = "sitemap.xml";
+    const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
+  
+    if (isUrlASitemap) {
+      const listOfUrls = await downloadListOfUrls({ url: config.url });
+  
+      // Add the initial URL to the crawling queue.
+      await crawler.addRequests(listOfUrls);
+  
+      // Run the crawler
+      await crawler.run();
+    } else {
+      // Add first URL to the queue and start the crawl.
+      await crawler.run([config.url]);
+    }
   }
 }