فهرست منبع

Merge pull request #45 from LeonKohli/main

feat: use Xpath as selector
Steve Sewell 1 سال پیش
والد
کامیت
9e8c3291a1
1فایلهای تغییر یافته به همراه28 افزوده شده و 8 حذف شده
  1. 28 8
      src/main.ts

+ 28 - 8
src/main.ts

@@ -7,11 +7,26 @@ import { Page } from "playwright";
 
 let pageCounter = 0; 
 
-export function getPageHtml(page: Page) {
+export function getPageHtml(page: Page, selector: string) {
   return page.evaluate((selector) => {
-    const el = document.querySelector(selector) as HTMLElement | null;
-    return el?.innerText || "";
-  }, config.selector);
+    // Check if the selector is an XPath
+    if (selector.startsWith('/')) {
+      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
+      let result = elements.iterateNext();
+      return result ? result.textContent || "" : "";
+    } else {
+      // Handle as a CSS selector
+      const el = document.querySelector(selector) as HTMLElement | null;
+      return el?.innerText || "";
+    }
+  }, selector);
+}
+
+export async function waitForXPath(page: Page, xpath: string, timeout: number) {
+  await page.waitForFunction(xpath => {
+    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
+    return elements.iterateNext() !== null;
+  }, xpath, { timeout });
 }
 
 if (process.env.NO_CRAWL !== "true") {
@@ -35,11 +50,16 @@ if (process.env.NO_CRAWL !== "true") {
       pageCounter++;
       log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
       
-      await page.waitForSelector(config.selector, {
-        timeout: config.waitForSelectorTimeout ?? 1000,
-      });
+      // Use custom handling for XPath selector
+      if (config.selector.startsWith('/')) {
+        await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
+      } else {
+        await page.waitForSelector(config.selector, {
+          timeout: config.waitForSelectorTimeout ?? 1000,
+        });
+      }
 
-      const html = await getPageHtml(page);
+      const html = await getPageHtml(page, config.selector);
 
       // Save results as JSON to ./storage/datasets/default
       await pushData({ title, url: request.loadedUrl, html });