Steve Sewell 1 year ago
parent
commit
61fbb62273
9 changed files with 116 additions and 103 deletions
  1. README.md (+1 -24)
  2. config.ts (+2 -43)
  3. containerapp/README.md (+3 -4)
  4. containerapp/data/config.ts (+2 -2)
  5. package-lock.json (+16 -1)
  6. package.json (+4 -2)
  7. src/cli.ts (+2 -5)
  8. src/config.ts (+41 -0)
  9. src/core.ts (+45 -22)

+ 1 - 24
README.md

@@ -1,5 +1,3 @@
-<!-- Markdown written with https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one -->
-
 # GPT Crawler <!-- omit from toc -->
 
 Crawl a site to generate knowledge files to create your own custom GPT from one or multiple URLs
@@ -66,7 +64,7 @@ export const defaultConfig: Config = {
 };
 ```
 
-See the top of the file for the type definition for what you can configure:
+See [config.ts](src/config.ts) for all available options. Here is a sample of the common config options:
 
 ```ts
 type Config = {
@@ -80,15 +78,6 @@ type Config = {
   maxPagesToCrawl: number;
   /** File name for the finished data */
   outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
 };
 ```
 
@@ -104,18 +93,6 @@ npm start
 
 To obtain the `output.json` with a containerized execution, go into the `containerapp` directory and modify the `config.ts` as above; the `output.json` file should be generated in the data folder. Note: the `outputFileName` property in the `config.ts` file in the containerapp folder is configured to work with the container.
 
-#### Running as a CLI
-
-<!-- TODO: Needs to be actually published -->
-
-##### Development
-
-To run the CLI locally while developing it:
-  
-```sh
-npm run start:cli --url https://www.builder.io/c/docs/developers --match https://www.builder.io/c/docs/** --selector .docs-builder-container --maxPagesToCrawl 50 --outputFileName output.json
-```
-
 ### Upload your data to OpenAI
 
 The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
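
The optional fields removed from this README section (`cookie`, `onVisitPage`, `waitForSelectorTimeout`) still exist on the `Config` type in `src/config.ts`, added later in this commit. As a rough sketch rather than part of the commit, a root-level `config.ts` that also sets the optional cookie might look like this (the cookie name and value are placeholders):

```ts
import { Config } from "./src/config";

export const defaultConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Placeholder cookie, e.g. to pre-accept a consent banner before crawling
  cookie: { name: "cookie_consent", value: "accepted" },
};
```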

+ 2 - 43
config.ts

@@ -1,49 +1,8 @@
-import { Page } from "playwright";
-
-export type Config = {
-  /**
-   * URL to start the crawl
-   * @example "https://www.builder.io/c/docs/developers"
-   * @default ""
-   */
-  url: string;
-  /**
-   * Pattern to match against for links on a page to subsequently crawl
-   * @example "https://www.builder.io/c/docs/**"
-   * @default ""
-   */
-  match: string | string[];
-  /**
-   * Selector to grab the inner text from
-   * @example ".docs-builder-container"
-   * @default ""
-   */
-  selector: string;
-  /**
-   * Don't crawl more than this many pages
-   * @default 50
-   */
-  maxPagesToCrawl: number;
-  /**
-   * File name for the finished data
-   * @default "output.json"
-   */
-  outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
-};
+import { Config } from "./src/config";
 
 export const defaultConfig: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
-  selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
-  outputFileName: "../output.json",
+  outputFileName: "output.json",
 };

+ 3 - 4
containerapp/README.md

@@ -1,15 +1,14 @@
 # Containerized crawler
-## Docker image with packaged crawler, with script for building and execution.
 
+## Docker image with the packaged crawler, plus a script for building and execution.
 
 All dependencies set up and configured in the Dockerfile. Requires docker to be installed.
 
-
 ## Get started
 
 ### Prerequisites
 
 Be sure you have docker installed
 
-1. ``` cd gpt-crawler/containerapp  ```
-2. ``` . ./run.sh  ```
+1. `cd gpt-crawler/containerapp`
+2. `. ./run.sh`

+ 2 - 2
containerapp/data/config.ts

@@ -12,13 +12,13 @@ type Config = {
   /** File name for the finished data */
   outputFileName: string;
   /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: {name: string; value: string}
+  cookie?: { name: string; value: string };
   /** Optional function to run for each page found */
   onVisitPage?: (options: {
     page: Page;
     pushData: (data: any) => Promise<void>;
   }) => Promise<void>;
-    /** Optional timeout for waiting for a selector to appear */
+  /** Optional timeout for waiting for a selector to appear */
   waitForSelectorTimeout?: number;
 };
 

+ 16 - 1
package-lock.json

@@ -14,7 +14,8 @@
         "crawlee": "^3.0.0",
         "glob": "^10.3.10",
         "inquirer": "^9.2.12",
-        "playwright": "*"
+        "playwright": "*",
+        "prettier": "^3.1.0"
       },
       "bin": {
         "gpt-crawler": "dist/src/cli.js"
@@ -2761,6 +2762,20 @@
         "node": ">=16"
       }
     },
+    "node_modules/prettier": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.1.0.tgz",
+      "integrity": "sha512-TQLvXjq5IAibjh8EpBIkNKxO749UEWABoiIZehEPiY4GNpVdhaFKqSTu+QrlU6D2dPAfubRmtJTi4K4YkQ5eXw==",
+      "bin": {
+        "prettier": "bin/prettier.cjs"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/prettier/prettier?sponsor=1"
+      }
+    },
     "node_modules/proper-lockfile": {
       "version": "4.1.2",
       "license": "MIT",

+ 4 - 2
package.json

@@ -11,7 +11,8 @@
     "crawlee": "^3.0.0",
     "glob": "^10.3.10",
     "inquirer": "^9.2.12",
-    "playwright": "*"
+    "playwright": "*",
+    "prettier": "^3.1.0"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
@@ -26,7 +27,8 @@
     "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
     "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/main.js",
-    "build": "tsc"
+    "build": "tsc",
+    "fmt": "prettier --write ."
   },
   "author": "It's not you it's me",
   "license": "ISC"

+ 2 - 5
src/cli.ts

@@ -65,8 +65,7 @@ async function handler(options: Config) {
         });
       }
 
-      const answers = await inquirer
-        .prompt(questions);
+      const answers = await inquirer.prompt(questions);
 
       config = {
         ...config,
@@ -81,9 +80,7 @@ async function handler(options: Config) {
   }
 }
 
-program
-  .version(version)
-  .description(description);
+program.version(version).description(description);
 
 program
   .option("-u, --url <string>", messages.url, "")

+ 41 - 0
src/config.ts

@@ -0,0 +1,41 @@
+import type { Page } from "playwright";
+
+export type Config = {
+  /**
+   * URL to start the crawl
+   * @example "https://www.builder.io/c/docs/developers"
+   * @default ""
+   */
+  url: string;
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
+  match: string | string[];
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   */
+  selector?: string;
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
+  maxPagesToCrawl: number;
+  /**
+   * File name for the finished data
+   * @default "output.json"
+   */
+  outputFileName: string;
+  /** Optional cookie to be set. E.g. for Cookie Consent */
+  cookie?: { name: string; value: string };
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
+};
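
To illustrate the optional hooks on the new `Config` type, here is a sketch (not part of the commit) of a root-level `config.ts` that uses `onVisitPage` to push an extra record and raises the selector timeout; the callback body and timeout value are illustrative only:

```ts
import { Config } from "./src/config";

export const exampleConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Wait up to 5 s for the selector (core.ts falls back to 1000 ms when unset)
  waitForSelectorTimeout: 5000,
  // Called for every crawled page; pushData appends a record to the dataset
  onVisitPage: async ({ page, pushData }) => {
    await pushData({ visitedUrl: page.url() });
  },
};
```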

+ 45 - 22
src/core.ts

@@ -5,13 +5,19 @@ import { glob } from "glob";
 import { Config } from "../config.js";
 import { Page } from "playwright";
 
-let pageCounter = 0; 
+let pageCounter = 0;
 
 export function getPageHtml(page: Page, selector: string) {
   return page.evaluate((selector) => {
     // Check if the selector is an XPath
-    if (selector.startsWith('/')) {
-      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
+    if (selector.startsWith("/")) {
+      const elements = document.evaluate(
+        selector,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null,
+      );
       let result = elements.iterateNext();
       return result ? result.textContent || "" : "";
     } else {
@@ -23,10 +29,20 @@ export function getPageHtml(page: Page, selector: string) {
 }
 
 export async function waitForXPath(page: Page, xpath: string, timeout: number) {
-  await page.waitForFunction(xpath => {
-    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
-    return elements.iterateNext() !== null;
-  }, xpath, { timeout });
+  await page.waitForFunction(
+    (xpath) => {
+      const elements = document.evaluate(
+        xpath,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null,
+      );
+      return elements.iterateNext() !== null;
+    },
+    xpath,
+    { timeout },
+  );
 }
 
 export async function crawl(config: Config) {
@@ -41,37 +57,44 @@ export async function crawl(config: Config) {
           const cookie = {
             name: config.cookie.name,
             value: config.cookie.value,
-            url: request.loadedUrl, 
+            url: request.loadedUrl,
           };
           await page.context().addCookies([cookie]);
         }
-  
+
         const title = await page.title();
         pageCounter++;
-        log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
-        
+        log.info(
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+        );
+
         // Use custom handling for XPath selector
-        if (config.selector.startsWith('/')) {
-          await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
+        if (config.selector.startsWith("/")) {
+          await waitForXPath(
+            page,
+            config.selector,
+            config.waitForSelectorTimeout ?? 1000,
+          );
         } else {
           await page.waitForSelector(config.selector, {
             timeout: config.waitForSelectorTimeout ?? 1000,
           });
         }
-  
+
         const html = await getPageHtml(page, config.selector);
-  
+
         // Save results as JSON to ./storage/datasets/default
         await pushData({ title, url: request.loadedUrl, html });
-  
+
         if (config.onVisitPage) {
           await config.onVisitPage({ page, pushData });
         }
-  
+
         // Extract links from the current page
         // and add them to the crawling queue.
         await enqueueLinks({
-          globs: typeof config.match === "string" ? [config.match] : config.match,
+          globs:
+            typeof config.match === "string" ? [config.match] : config.match,
         });
       },
       // Comment this option to scrape the full website.
@@ -79,22 +102,22 @@ export async function crawl(config: Config) {
       // Uncomment this option to see the browser window.
       // headless: false,
     });
-  
+
     // Add first URL to the queue and start the crawl.
     await crawler.run([config.url]);
-  }  
+  }
 }
 
 export async function write(config: Config) {
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
-  
+
   const results = [];
   for (const file of jsonFiles) {
     const data = JSON.parse(await readFile(file, "utf-8"));
     results.push(data);
   }
-  
+
   await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
 }