Steve Sewell 1 year ago
parent
commit
61fbb62273
9 changed files with 116 additions and 103 deletions
  1. README.md (+1 -24)
  2. config.ts (+2 -43)
  3. containerapp/README.md (+3 -4)
  4. containerapp/data/config.ts (+2 -2)
  5. package-lock.json (+16 -1)
  6. package.json (+4 -2)
  7. src/cli.ts (+2 -5)
  8. src/config.ts (+41 -0)
  9. src/core.ts (+45 -22)

+ 1 - 24
README.md

@@ -1,5 +1,3 @@
-<!-- Markdown written with https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one -->
-
 # GPT Crawler <!-- omit from toc -->

 Crawl a site to generate knowledge files to create your own custom GPT from one or multiple URLs
@@ -66,7 +64,7 @@ export const defaultConfig: Config = {
 };
 ```

-See the top of the file for the type definition for what you can configure:
+See [config.ts](src/config.ts) for all available options. Here is a sample of the common config options:

 ```ts
 type Config = {
@@ -80,15 +78,6 @@ type Config = {
   maxPagesToCrawl: number;
   /** File name for the finished data */
   outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
 };
 ```

@@ -104,18 +93,6 @@ npm start

 To obtain the `output.json` with a containerized execution, go into the `containerapp` directory and modify `config.ts` as above; the `output.json` file should be generated in the data folder. Note: the `outputFileName` property in the `config.ts` file in the containerapp folder is configured to work with the container.

-#### Running as a CLI
-
-<!-- TODO: Needs to be actually published -->
-
-##### Development
-
-To run the CLI locally while developing it:
-  
-```sh
-npm run start:cli --url https://www.builder.io/c/docs/developers --match https://www.builder.io/c/docs/** --selector .docs-builder-container --maxPagesToCrawl 50 --outputFileName output.json
-```
-
 ### Upload your data to OpenAI

 The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
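
For reference, a minimal sketch of that upload step using the `openai` Node SDK; the SDK usage, assistant name, and model below are assumptions for illustration, not part of this repo:

```ts
import fs from "node:fs";
import OpenAI from "openai";

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Upload the crawl output as a file usable by assistants.
const file = await openai.files.create({
  file: fs.createReadStream("output.json"),
  purpose: "assistants",
});

// Attach it to a new assistant with retrieval enabled (hypothetical name/model).
await openai.beta.assistants.create({
  name: "Docs Assistant",
  model: "gpt-4-1106-preview",
  tools: [{ type: "retrieval" }],
  file_ids: [file.id],
});
```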

+ 2 - 43
config.ts

@@ -1,49 +1,8 @@
-import { Page } from "playwright";
-
-export type Config = {
-  /**
-   * URL to start the crawl
-   * @example "https://www.builder.io/c/docs/developers"
-   * @default ""
-   */
-  url: string;
-  /**
-   * Pattern to match against for links on a page to subsequently crawl
-   * @example "https://www.builder.io/c/docs/**"
-   * @default ""
-   */
-  match: string | string[];
-  /**
-   * Selector to grab the inner text from
-   * @example ".docs-builder-container"
-   * @default ""
-   */
-  selector: string;
-  /**
-   * Don't crawl more than this many pages
-   * @default 50
-   */
-  maxPagesToCrawl: number;
-  /**
-   * File name for the finished data
-   * @default "output.json"
-   */
-  outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
-};
+import { Config } from "./src/config";

 export const defaultConfig: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
-  selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
-  outputFileName: "../output.json",
+  outputFileName: "output.json",
 };

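With the `Config` type now imported from src/config.ts, `selector` is optional in the root config. Since src/core.ts routes selectors that start with `/` through its XPath path (see below), an equivalent config could also supply one; a hypothetical variant, not part of this commit:

```ts
import { Config } from "./src/config";

// Hypothetical variant using an XPath selector instead of a CSS selector;
// core.ts treats any selector starting with "/" as XPath.
export const xpathConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: "//div[contains(@class, 'docs-builder-container')]",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};
```
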
+ 3 - 4
containerapp/README.md

@@ -1,15 +1,14 @@
 # Containerized crawler
-## Docker image with packaged crawler, with script for building and execution.

+## Docker image with packaged crawler, with script for building and execution.

 All dependencies set up and configured in the Dockerfile. Requires docker to be installed.

-
 ## Get started

 ### Prerequisites

 Be sure you have docker installed

-1. ``` cd gpt-crawler/containerapp  ```
-2. ``` . ./run.sh  ```
+1. `cd gpt-crawler/containerapp `
+2. `. ./run.sh `

+ 2 - 2
containerapp/data/config.ts

@@ -12,13 +12,13 @@ type Config = {
   /** File name for the finished data */
   outputFileName: string;
   /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: {name: string; value: string}
+  cookie?: { name: string; value: string };
   /** Optional function to run for each page found */
   onVisitPage?: (options: {
     page: Page;
     pushData: (data: any) => Promise<void>;
   }) => Promise<void>;
-    /** Optional timeout for waiting for a selector to appear */
+  /** Optional timeout for waiting for a selector to appear */
   waitForSelectorTimeout?: number;
 };


+ 16 - 1
package-lock.json

@@ -14,7 +14,8 @@
         "crawlee": "^3.0.0",
         "crawlee": "^3.0.0",
         "glob": "^10.3.10",
         "glob": "^10.3.10",
         "inquirer": "^9.2.12",
         "inquirer": "^9.2.12",
-        "playwright": "*"
+        "playwright": "*",
+        "prettier": "^3.1.0"
       },
       "bin": {
         "gpt-crawler": "dist/src/cli.js"
@@ -2761,6 +2762,20 @@
         "node": ">=16"
         "node": ">=16"
       }
       }
     },
     },
+    "node_modules/prettier": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.1.0.tgz",
+      "integrity": "sha512-TQLvXjq5IAibjh8EpBIkNKxO749UEWABoiIZehEPiY4GNpVdhaFKqSTu+QrlU6D2dPAfubRmtJTi4K4YkQ5eXw==",
+      "bin": {
+        "prettier": "bin/prettier.cjs"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/prettier/prettier?sponsor=1"
+      }
+    },
     "node_modules/proper-lockfile": {
     "node_modules/proper-lockfile": {
       "version": "4.1.2",
       "version": "4.1.2",
       "license": "MIT",
       "license": "MIT",

+ 4 - 2
package.json

@@ -11,7 +11,8 @@
     "crawlee": "^3.0.0",
     "crawlee": "^3.0.0",
     "glob": "^10.3.10",
     "glob": "^10.3.10",
     "inquirer": "^9.2.12",
     "inquirer": "^9.2.12",
-    "playwright": "*"
+    "playwright": "*",
+    "prettier": "^3.1.0"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
@@ -26,7 +27,8 @@
     "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
     "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
     "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
     "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/main.js",
     "start:prod": "node dist/main.js",
-    "build": "tsc"
+    "build": "tsc",
+    "fmt": "prettier --write ."
   },
   "author": "It's not you it's me",
   "license": "ISC"

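With the new script in place, `npm run fmt` runs Prettier over the repo; the whitespace-only reformatting in src/cli.ts and src/core.ts below appears to be its output.
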
+ 2 - 5
src/cli.ts

@@ -65,8 +65,7 @@ async function handler(options: Config) {
         });
       }

-      const answers = await inquirer
-        .prompt(questions);
+      const answers = await inquirer.prompt(questions);

       config = {
         ...config,
@@ -81,9 +80,7 @@ async function handler(options: Config) {
   }
 }

-program
-  .version(version)
-  .description(description);
+program.version(version).description(description);

 program
   .option("-u, --url <string>", messages.url, "")

+ 41 - 0
src/config.ts

@@ -0,0 +1,41 @@
+import type { Page } from "playwright";
+
+export type Config = {
+  /**
+   * URL to start the crawl
+   * @example "https://www.builder.io/c/docs/developers"
+   * @default ""
+   */
+  url: string;
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
+  match: string | string[];
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   */
+  selector?: string;
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
+  maxPagesToCrawl: number;
+  /**
+   * File name for the finished data
+   * @default "output.json"
+   */
+  outputFileName: string;
+  /** Optional cookie to be set. E.g. for Cookie Consent */
+  cookie?: { name: string; value: string };
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
+};
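
A sketch of a config that exercises the optional fields of this type; the cookie values and the `onVisitPage` body are made-up examples:

```ts
import { Config } from "./src/config";

export const exampleConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Optional: set a consent cookie before pages load (hypothetical values).
  cookie: { name: "CookieConsent", value: "accepted" },
  // Optional: wait up to 3s for the selector instead of core.ts's 1s default.
  waitForSelectorTimeout: 3000,
  // Optional: push extra data for every page visited.
  onVisitPage: async ({ page, pushData }) => {
    const h1 = await page.locator("h1").first().textContent();
    await pushData({ url: page.url(), h1 });
  },
};
```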

+ 45 - 22
src/core.ts

@@ -5,13 +5,19 @@ import { glob } from "glob";
 import { Config } from "../config.js";
 import { Page } from "playwright";

-let pageCounter = 0; 
+let pageCounter = 0;

 export function getPageHtml(page: Page, selector: string) {
   return page.evaluate((selector) => {
     // Check if the selector is an XPath
-    if (selector.startsWith('/')) {
-      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
+    if (selector.startsWith("/")) {
+      const elements = document.evaluate(
+        selector,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null,
+      );
       let result = elements.iterateNext();
       return result ? result.textContent || "" : "";
     } else {
@@ -23,10 +29,20 @@ export function getPageHtml(page: Page, selector: string) {
 }

 export async function waitForXPath(page: Page, xpath: string, timeout: number) {
-  await page.waitForFunction(xpath => {
-    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
-    return elements.iterateNext() !== null;
-  }, xpath, { timeout });
+  await page.waitForFunction(
+    (xpath) => {
+      const elements = document.evaluate(
+        xpath,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null,
+      );
+      return elements.iterateNext() !== null;
+    },
+    xpath,
+    { timeout },
+  );
 }

 export async function crawl(config: Config) {
@@ -41,37 +57,44 @@ export async function crawl(config: Config) {
           const cookie = {
             name: config.cookie.name,
             value: config.cookie.value,
-            url: request.loadedUrl, 
+            url: request.loadedUrl,
           };
           await page.context().addCookies([cookie]);
         }
-  
+
         const title = await page.title();
         pageCounter++;
-        log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
-        
+        log.info(
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+        );
+
         // Use custom handling for XPath selector
-        if (config.selector.startsWith('/')) {
-          await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
+        if (config.selector.startsWith("/")) {
+          await waitForXPath(
+            page,
+            config.selector,
+            config.waitForSelectorTimeout ?? 1000,
+          );
         } else {
           await page.waitForSelector(config.selector, {
             timeout: config.waitForSelectorTimeout ?? 1000,
           });
         }
-  
+
         const html = await getPageHtml(page, config.selector);
-  
+
         // Save results as JSON to ./storage/datasets/default
         await pushData({ title, url: request.loadedUrl, html });
-  
+
         if (config.onVisitPage) {
           await config.onVisitPage({ page, pushData });
         }
-  
+
         // Extract links from the current page
         // and add them to the crawling queue.
         await enqueueLinks({
-          globs: typeof config.match === "string" ? [config.match] : config.match,
+          globs:
+            typeof config.match === "string" ? [config.match] : config.match,
         });
       },
       // Comment this option to scrape the full website.
@@ -79,22 +102,22 @@ export async function crawl(config: Config) {
       // Uncomment this option to see the browser window.
       // headless: false,
     });
-  
+
     // Add first URL to the queue and start the crawl.
     await crawler.run([config.url]);
-  }  
+  }
 }

 export async function write(config: Config) {
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
-  
+
   const results = [];
   for (const file of jsonFiles) {
     const data = JSON.parse(await readFile(file, "utf-8"));
     results.push(data);
   }
-  
+
   await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
 }
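
Taken together, a run of the crawler reduces to these two exported functions; a minimal sketch of an entry point, assuming ESM output like the `dist/src/main.js` referenced in package.json:

```ts
import { crawl, write } from "./src/core.js";
import { defaultConfig } from "./config.js";

async function main() {
  // Crawl pages into ./storage/datasets/default as individual JSON files...
  await crawl(defaultConfig);
  // ...then merge them into the single file named by outputFileName.
  await write(defaultConfig);
}

main().catch(console.error);
```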
 }