Steve Sewell 1 year ago
parent
commit
61fbb62273
9 changed files with 116 additions and 103 deletions
  1. README.md (+1 -24)
  2. config.ts (+2 -43)
  3. containerapp/README.md (+3 -4)
  4. containerapp/data/config.ts (+2 -2)
  5. package-lock.json (+16 -1)
  6. package.json (+4 -2)
  7. src/cli.ts (+2 -5)
  8. src/config.ts (+41 -0)
  9. src/core.ts (+45 -22)

+ 1 - 24
README.md

@@ -1,5 +1,3 @@
-<!-- Markdown written with https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one -->
-
 # GPT Crawler <!-- omit from toc -->
 
 Crawl a site to generate knowledge files to create your own custom GPT from one or multiple URLs
@@ -66,7 +64,7 @@ export const defaultConfig: Config = {
 };
 ```
 
-See the top of the file for the type definition for what you can configure:
+See [config.ts](src/config.ts) for all available options. Here is a sample of the common config options:
 
 ```ts
 type Config = {
@@ -80,15 +78,6 @@ type Config = {
   maxPagesToCrawl: number;
   /** File name for the finished data */
   outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
 };
 ```
 
@@ -104,18 +93,6 @@ npm start
 
 To obtain the `output.json` with a containerized execution, go into the `containerapp` directory and modify the `config.ts` as above; the `output.json` file should be generated in the data folder. Note: the `outputFileName` property in the `config.ts` file in the containerapp folder is configured to work with the container.
 
-#### Running as a CLI
-
-<!-- TODO: Needs to be actually published -->
-
-##### Development
-
-To run the CLI locally while developing it:
-  
-```sh
-npm run start:cli --url https://www.builder.io/c/docs/developers --match https://www.builder.io/c/docs/** --selector .docs-builder-container --maxPagesToCrawl 50 --outputFileName output.json
-```
-
 ### Upload your data to OpenAI
 
 The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
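
The optional fields removed from this README section (`cookie`, `onVisitPage`, `waitForSelectorTimeout`) still exist on the `Config` type in `src/config.ts`, added later in this commit. As a rough sketch rather than part of the commit, a root-level `config.ts` that also sets the optional cookie might look like this (the cookie name and value are placeholders):

```ts
import { Config } from "./src/config";

export const defaultConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Placeholder cookie, e.g. to pre-accept a consent banner before crawling
  cookie: { name: "cookie_consent", value: "accepted" },
};
```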

+ 2 - 43
config.ts

@@ -1,49 +1,8 @@
-import { Page } from "playwright";
-
-export type Config = {
-  /**
-   * URL to start the crawl
-   * @example "https://www.builder.io/c/docs/developers"
-   * @default ""
-   */
-  url: string;
-  /**
-   * Pattern to match against for links on a page to subsequently crawl
-   * @example "https://www.builder.io/c/docs/**"
-   * @default ""
-   */
-  match: string | string[];
-  /**
-   * Selector to grab the inner text from
-   * @example ".docs-builder-container"
-   * @default ""
-   */
-  selector: string;
-  /**
-   * Don't crawl more than this many pages
-   * @default 50
-   */
-  maxPagesToCrawl: number;
-  /**
-   * File name for the finished data
-   * @default "output.json"
-   */
-  outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
-};
+import { Config } from "./src/config";
 
 export const defaultConfig: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
-  selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
-  outputFileName: "../output.json",
+  outputFileName: "output.json",
 };

+ 3 - 4
containerapp/README.md

@@ -1,15 +1,14 @@
 # Containerized crawler
-## Docker image with packaged crawler, with script for building and execution.
 
+## Docker image with the packaged crawler, plus a script for building and execution.
 
 All dependencies set up and configured in the Dockerfile. Requires docker to be installed.
 
-
 ## Get started
 
 ### Prerequisites
 
 Be sure you have docker installed
 
-1. ``` cd gpt-crawler/containerapp  ```
-2. ``` . ./run.sh  ```
+1. `cd gpt-crawler/containerapp`
+2. `. ./run.sh`

+ 2 - 2
containerapp/data/config.ts

@@ -12,13 +12,13 @@ type Config = {
   /** File name for the finished data */
   outputFileName: string;
   /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: {name: string; value: string}
+  cookie?: { name: string; value: string };
   /** Optional function to run for each page found */
   onVisitPage?: (options: {
     page: Page;
     pushData: (data: any) => Promise<void>;
   }) => Promise<void>;
-    /** Optional timeout for waiting for a selector to appear */
+  /** Optional timeout for waiting for a selector to appear */
   waitForSelectorTimeout?: number;
 };
 

+ 16 - 1
package-lock.json

@@ -14,7 +14,8 @@
         "crawlee": "^3.0.0",
         "glob": "^10.3.10",
         "inquirer": "^9.2.12",
-        "playwright": "*"
+        "playwright": "*",
+        "prettier": "^3.1.0"
       },
       "bin": {
         "gpt-crawler": "dist/src/cli.js"
@@ -2761,6 +2762,20 @@
         "node": ">=16"
       }
     },
+    "node_modules/prettier": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.1.0.tgz",
+      "integrity": "sha512-TQLvXjq5IAibjh8EpBIkNKxO749UEWABoiIZehEPiY4GNpVdhaFKqSTu+QrlU6D2dPAfubRmtJTi4K4YkQ5eXw==",
+      "bin": {
+        "prettier": "bin/prettier.cjs"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/prettier/prettier?sponsor=1"
+      }
+    },
     "node_modules/proper-lockfile": {
       "version": "4.1.2",
       "license": "MIT",

+ 4 - 2
package.json

@@ -11,7 +11,8 @@
     "crawlee": "^3.0.0",
     "glob": "^10.3.10",
     "inquirer": "^9.2.12",
-    "playwright": "*"
+    "playwright": "*",
+    "prettier": "^3.1.0"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
@@ -26,7 +27,8 @@
     "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
     "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/main.js",
-    "build": "tsc"
+    "build": "tsc",
+    "fmt": "prettier --write ."
   },
   "author": "It's not you it's me",
   "license": "ISC"

+ 2 - 5
src/cli.ts

@@ -65,8 +65,7 @@ async function handler(options: Config) {
         });
       }
 
-      const answers = await inquirer
-        .prompt(questions);
+      const answers = await inquirer.prompt(questions);
 
       config = {
         ...config,
@@ -81,9 +80,7 @@ async function handler(options: Config) {
   }
 }
 
-program
-  .version(version)
-  .description(description);
+program.version(version).description(description);
 
 program
   .option("-u, --url <string>", messages.url, "")

+ 41 - 0
src/config.ts

@@ -0,0 +1,41 @@
+import type { Page } from "playwright";
+
+export type Config = {
+  /**
+   * URL to start the crawl
+   * @example "https://www.builder.io/c/docs/developers"
+   * @default ""
+   */
+  url: string;
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
+  match: string | string[];
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   */
+  selector?: string;
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
+  maxPagesToCrawl: number;
+  /**
+   * File name for the finished data
+   * @default "output.json"
+   */
+  outputFileName: string;
+  /** Optional cookie to be set. E.g. for Cookie Consent */
+  cookie?: { name: string; value: string };
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
+};
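
To illustrate the optional hooks on the new `Config` type, here is a sketch (not part of the commit) of a root-level `config.ts` that uses `onVisitPage` to push an extra record and raises the selector timeout; the callback body and timeout value are illustrative only:

```ts
import { Config } from "./src/config";

export const exampleConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Wait up to 5 s for the selector (core.ts falls back to 1000 ms when unset)
  waitForSelectorTimeout: 5000,
  // Called for every crawled page; pushData appends a record to the dataset
  onVisitPage: async ({ page, pushData }) => {
    await pushData({ visitedUrl: page.url() });
  },
};
```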

+ 45 - 22
src/core.ts

@@ -5,13 +5,19 @@ import { glob } from "glob";
 import { Config } from "../config.js";
 import { Page } from "playwright";
 
-let pageCounter = 0; 
+let pageCounter = 0;
 
 export function getPageHtml(page: Page, selector: string) {
   return page.evaluate((selector) => {
     // Check if the selector is an XPath
-    if (selector.startsWith('/')) {
-      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
+    if (selector.startsWith("/")) {
+      const elements = document.evaluate(
+        selector,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null,
+      );
       let result = elements.iterateNext();
       return result ? result.textContent || "" : "";
     } else {
@@ -23,10 +29,20 @@ export function getPageHtml(page: Page, selector: string) {
 }
 
 export async function waitForXPath(page: Page, xpath: string, timeout: number) {
-  await page.waitForFunction(xpath => {
-    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
-    return elements.iterateNext() !== null;
-  }, xpath, { timeout });
+  await page.waitForFunction(
+    (xpath) => {
+      const elements = document.evaluate(
+        xpath,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null,
+      );
+      return elements.iterateNext() !== null;
+    },
+    xpath,
+    { timeout },
+  );
 }
 
 export async function crawl(config: Config) {
@@ -41,37 +57,44 @@ export async function crawl(config: Config) {
           const cookie = {
             name: config.cookie.name,
             value: config.cookie.value,
-            url: request.loadedUrl, 
+            url: request.loadedUrl,
           };
           await page.context().addCookies([cookie]);
         }
-  
+
         const title = await page.title();
         pageCounter++;
-        log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
-        
+        log.info(
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+        );
+
         // Use custom handling for XPath selector
-        if (config.selector.startsWith('/')) {
-          await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
+        if (config.selector.startsWith("/")) {
+          await waitForXPath(
+            page,
+            config.selector,
+            config.waitForSelectorTimeout ?? 1000,
+          );
         } else {
           await page.waitForSelector(config.selector, {
             timeout: config.waitForSelectorTimeout ?? 1000,
           });
         }
-  
+
         const html = await getPageHtml(page, config.selector);
-  
+
         // Save results as JSON to ./storage/datasets/default
         await pushData({ title, url: request.loadedUrl, html });
-  
+
         if (config.onVisitPage) {
           await config.onVisitPage({ page, pushData });
         }
-  
+
         // Extract links from the current page
         // and add them to the crawling queue.
         await enqueueLinks({
-          globs: typeof config.match === "string" ? [config.match] : config.match,
+          globs:
+            typeof config.match === "string" ? [config.match] : config.match,
         });
       },
       // Comment this option to scrape the full website.
@@ -79,22 +102,22 @@ export async function crawl(config: Config) {
       // Uncomment this option to see the browser window.
       // headless: false,
     });
-  
+
     // Add first URL to the queue and start the crawl.
     await crawler.run([config.url]);
-  }  
+  }
 }
 
 export async function write(config: Config) {
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
-  
+
   const results = [];
   for (const file of jsonFiles) {
     const data = JSON.parse(await readFile(file, "utf-8"));
     results.push(data);
   }
-  
+
   await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
 }