Explorar el Código

Merge pull request #26 from guillermoscript/sitemap-support

Refactor the getPageHtml function to handle the case where the selector is not found, using body as a fallback. Add support for downloading URLs from sitemap.xml. Update comments to note that sitemaps are supported.
Steve Sewell hace 1 año
padre
commit
10a71edde2
Se han modificado 4 ficheros con 44 adiciones y 6 borrados
  1. BIN
      .DS_Store
  2. 7 1
      README.md
  3. 8 1
      src/config.ts
  4. 29 4
      src/core.ts

BIN
.DS_Store


La diferencia del archivo ha sido suprimida porque es demasiado grande
+ 7 - 1
README.md


La diferencia del archivo ha sido suprimida porque es demasiado grande
+ 8 - 1
src/config.ts


+ 29 - 4
src/core.ts

@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler } from "crawlee";
+import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import {Config, configSchema} from "./config.js";
@@ -45,7 +45,7 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
   );
 }
 
-export async function crawl(config: Config) {
+export async function crawl(config: Config) { 
   configSchema.parse(config);
 
   if (process.env.NO_CRAWL !== "true") {
@@ -105,10 +105,35 @@ export async function crawl(config: Config) {
       maxRequestsPerCrawl: config.maxPagesToCrawl,
       // Uncomment this option to see the browser window.
       // headless: false,
+      preNavigationHooks: [
+        // Abort requests for certain resource types
+        async ({ page, log }) => {
+          // If there are no resource exclusions, return
+          const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
+          if (RESOURCE_EXCLUSTIONS.length === 0) {
+            return;
+          }
+          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
+          log.info(`Aborting requests for as this is a resource excluded route`);
+        }
+      ],
     });
 
-    // Add first URL to the queue and start the crawl.
-    await crawler.run([config.url]);
+    const SITEMAP_SUFFIX = "sitemap.xml";
+    const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
+  
+    if (isUrlASitemap) {
+      const listOfUrls = await downloadListOfUrls({ url: config.url });
+  
+      // Add every URL listed in the sitemap to the crawling queue.
+      await crawler.addRequests(listOfUrls);
+  
+      // Run the crawler
+      await crawler.run();
+    } else {
+      // Add first URL to the queue and start the crawl.
+      await crawler.run([config.url]);
+    }
   }
 }