Steve Sewell 1 년 전
부모
커밋
584c3d3e6e
2개의 변경된 파일, 19개의 추가 그리고 17개의 삭제
  1. 1 1
      src/cli.ts
  2. 18 16
      src/core.ts

+ 1 - 1
src/cli.ts

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
 
 import { program } from "commander";
-import { Config } from "../config.js";
+import { Config } from "./config.js";
 import { crawl, write } from "./core.js";
 import { createRequire } from "node:module";
 import inquirer from "inquirer";

+ 18 - 16
src/core.ts

@@ -2,12 +2,12 @@
 import { PlaywrightCrawler } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import { Config } from "../config.js";
+import { Config } from "./config.js";
 import { Page } from "playwright";
 
 let pageCounter = 0;
 
-export function getPageHtml(page: Page, selector: string) {
+export function getPageHtml(page: Page, selector = "body") {
   return page.evaluate((selector) => {
     // Check if the selector is an XPath
     if (selector.startsWith("/")) {
@@ -16,7 +16,7 @@ export function getPageHtml(page: Page, selector: string) {
         document,
         null,
         XPathResult.ANY_TYPE,
-        null,
+        null
       );
       let result = elements.iterateNext();
       return result ? result.textContent || "" : "";
@@ -36,12 +36,12 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
         document,
         null,
         XPathResult.ANY_TYPE,
-        null,
+        null
       );
       return elements.iterateNext() !== null;
     },
     xpath,
-    { timeout },
+    { timeout }
   );
 }
 
@@ -65,20 +65,22 @@ export async function crawl(config: Config) {
         const title = await page.title();
         pageCounter++;
         log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
         );
 
         // Use custom handling for XPath selector
-        if (config.selector.startsWith("/")) {
-          await waitForXPath(
-            page,
-            config.selector,
-            config.waitForSelectorTimeout ?? 1000,
-          );
-        } else {
-          await page.waitForSelector(config.selector, {
-            timeout: config.waitForSelectorTimeout ?? 1000,
-          });
+        if (config.selector) {
+          if (config.selector.startsWith("/")) {
+            await waitForXPath(
+              page,
+              config.selector,
+              config.waitForSelectorTimeout ?? 1000
+            );
+          } else {
+            await page.waitForSelector(config.selector, {
+              timeout: config.waitForSelectorTimeout ?? 1000,
+            });
+          }
         }
 
         const html = await getPageHtml(page, config.selector);