Steve Sewell 1 year ago
parent
commit
61fbb62273
9 changed files with 116 additions and 103 deletions
  1. README.md (+1 -24)
  2. config.ts (+2 -43)
  3. containerapp/README.md (+3 -4)
  4. containerapp/data/config.ts (+2 -2)
  5. package-lock.json (+16 -1)
  6. package.json (+4 -2)
  7. src/cli.ts (+2 -5)
  8. src/config.ts (+41 -0)
  9. src/core.ts (+45 -22)

+ 1 - 24
README.md

@@ -1,5 +1,3 @@
-<!-- Markdown written with https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one -->
-
 # GPT Crawler <!-- omit from toc -->

 Crawl a site to generate knowledge files to create your own custom GPT from one or multiple URLs
@@ -66,7 +64,7 @@ export const defaultConfig: Config = {
 };
 ```

-See the top of the file for the type definition for what you can configure:
+See [config.ts](src/config.ts) for all available options. Here is a sample of the common config options:

 ```ts
 type Config = {
@@ -80,15 +78,6 @@ type Config = {
   maxPagesToCrawl: number;
   /** File name for the finished data */
   outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
 };
 ```

@@ -104,18 +93,6 @@ npm start

 To obtain the `output.json` with a containerized execution, go into the `containerapp` directory and modify `config.ts` as above; the `output.json` file should be generated in the data folder. Note: the `outputFileName` property in the `config.ts` file in the containerapp folder is configured to work with the container.

-#### Running as a CLI
-
-<!-- TODO: Needs to be actually published -->
-
-##### Development
-
-To run the CLI locally while developing it:
-  
-```sh
-npm run start:cli --url https://www.builder.io/c/docs/developers --match https://www.builder.io/c/docs/** --selector .docs-builder-container --maxPagesToCrawl 50 --outputFileName output.json
-```
-
 ### Upload your data to OpenAI

 The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
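
For reference, a minimal sketch of that upload step using the `openai` Node SDK; the SDK usage, assistant name, and model below are assumptions for illustration, not part of this repo:

```ts
import fs from "node:fs";
import OpenAI from "openai";

const openai = new OpenAI(); // reads OPENAI_API_KEY from the environment

// Upload the crawl output as a file usable by assistants.
const file = await openai.files.create({
  file: fs.createReadStream("output.json"),
  purpose: "assistants",
});

// Attach it to a new assistant with retrieval enabled (hypothetical name/model).
await openai.beta.assistants.create({
  name: "Docs Assistant",
  model: "gpt-4-1106-preview",
  tools: [{ type: "retrieval" }],
  file_ids: [file.id],
});
```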

+ 2 - 43
config.ts

@@ -1,49 +1,8 @@
-import { Page } from "playwright";
-
-export type Config = {
-  /**
-   * URL to start the crawl
-   * @example "https://www.builder.io/c/docs/developers"
-   * @default ""
-   */
-  url: string;
-  /**
-   * Pattern to match against for links on a page to subsequently crawl
-   * @example "https://www.builder.io/c/docs/**"
-   * @default ""
-   */
-  match: string | string[];
-  /**
-   * Selector to grab the inner text from
-   * @example ".docs-builder-container"
-   * @default ""
-   */
-  selector: string;
-  /**
-   * Don't crawl more than this many pages
-   * @default 50
-   */
-  maxPagesToCrawl: number;
-  /**
-   * File name for the finished data
-   * @default "output.json"
-   */
-  outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
-};
+import { Config } from "./src/config";

 export const defaultConfig: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
-  selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
-  outputFileName: "../output.json",
+  outputFileName: "output.json",
 };

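With the `Config` type now imported from src/config.ts, `selector` is optional in the root config. Since src/core.ts routes selectors that start with `/` through its XPath path (see below), an equivalent config could also supply one; a hypothetical variant, not part of this commit:

```ts
import { Config } from "./src/config";

// Hypothetical variant using an XPath selector instead of a CSS selector;
// core.ts treats any selector starting with "/" as XPath.
export const xpathConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: "//div[contains(@class, 'docs-builder-container')]",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};
```
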
+ 3 - 4
containerapp/README.md

@@ -1,15 +1,14 @@
 # Containerized crawler
-## Docker image with packaged crawler, with script for building and execution.

+## Docker image with packaged crawler, with script for building and execution.

 All dependencies set up and configured in the Dockerfile. Requires docker to be installed.

-
 ## Get started

 ### Prerequisites

 Be sure you have docker installed

-1. ``` cd gpt-crawler/containerapp  ```
-2. ``` . ./run.sh  ```
+1. `cd gpt-crawler/containerapp `
+2. `. ./run.sh `

+ 2 - 2
containerapp/data/config.ts

@@ -12,13 +12,13 @@ type Config = {
   /** File name for the finished data */
   outputFileName: string;
   /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: {name: string; value: string}
+  cookie?: { name: string; value: string };
   /** Optional function to run for each page found */
   onVisitPage?: (options: {
     page: Page;
     pushData: (data: any) => Promise<void>;
   }) => Promise<void>;
-    /** Optional timeout for waiting for a selector to appear */
+  /** Optional timeout for waiting for a selector to appear */
   waitForSelectorTimeout?: number;
 };


+ 16 - 1
package-lock.json

@@ -14,7 +14,8 @@
         "crawlee": "^3.0.0",
         "crawlee": "^3.0.0",
         "glob": "^10.3.10",
         "glob": "^10.3.10",
         "inquirer": "^9.2.12",
         "inquirer": "^9.2.12",
-        "playwright": "*"
+        "playwright": "*",
+        "prettier": "^3.1.0"
       },
       "bin": {
         "gpt-crawler": "dist/src/cli.js"
@@ -2761,6 +2762,20 @@
         "node": ">=16"
         "node": ">=16"
       }
       }
     },
     },
+    "node_modules/prettier": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.1.0.tgz",
+      "integrity": "sha512-TQLvXjq5IAibjh8EpBIkNKxO749UEWABoiIZehEPiY4GNpVdhaFKqSTu+QrlU6D2dPAfubRmtJTi4K4YkQ5eXw==",
+      "bin": {
+        "prettier": "bin/prettier.cjs"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "funding": {
+        "url": "https://github.com/prettier/prettier?sponsor=1"
+      }
+    },
     "node_modules/proper-lockfile": {
     "node_modules/proper-lockfile": {
       "version": "4.1.2",
       "version": "4.1.2",
       "license": "MIT",
       "license": "MIT",

+ 4 - 2
package.json

@@ -11,7 +11,8 @@
     "crawlee": "^3.0.0",
     "crawlee": "^3.0.0",
     "glob": "^10.3.10",
     "glob": "^10.3.10",
     "inquirer": "^9.2.12",
     "inquirer": "^9.2.12",
-    "playwright": "*"
+    "playwright": "*",
+    "prettier": "^3.1.0"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
@@ -26,7 +27,8 @@
     "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
     "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
     "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
     "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/main.js",
     "start:prod": "node dist/main.js",
-    "build": "tsc"
+    "build": "tsc",
+    "fmt": "prettier --write ."
   },
   "author": "It's not you it's me",
   "license": "ISC"

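With the new script in place, `npm run fmt` runs Prettier over the repo; the whitespace-only reformatting in src/cli.ts and src/core.ts below appears to be its output.
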
+ 2 - 5
src/cli.ts

@@ -65,8 +65,7 @@ async function handler(options: Config) {
         });
       }

-      const answers = await inquirer
-        .prompt(questions);
+      const answers = await inquirer.prompt(questions);

       config = {
         ...config,
@@ -81,9 +80,7 @@ async function handler(options: Config) {
   }
 }

-program
-  .version(version)
-  .description(description);
+program.version(version).description(description);

 program
   .option("-u, --url <string>", messages.url, "")

+ 41 - 0
src/config.ts

@@ -0,0 +1,41 @@
+import type { Page } from "playwright";
+
+export type Config = {
+  /**
+   * URL to start the crawl
+   * @example "https://www.builder.io/c/docs/developers"
+   * @default ""
+   */
+  url: string;
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
+  match: string | string[];
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   */
+  selector?: string;
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
+  maxPagesToCrawl: number;
+  /**
+   * File name for the finished data
+   * @default "output.json"
+   */
+  outputFileName: string;
+  /** Optional cookie to be set. E.g. for Cookie Consent */
+  cookie?: { name: string; value: string };
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
+};
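
A sketch of a config that exercises the optional fields of this type; the cookie values and the `onVisitPage` body are made-up examples:

```ts
import { Config } from "./src/config";

export const exampleConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Optional: set a consent cookie before pages load (hypothetical values).
  cookie: { name: "CookieConsent", value: "accepted" },
  // Optional: wait up to 3s for the selector instead of core.ts's 1s default.
  waitForSelectorTimeout: 3000,
  // Optional: push extra data for every page visited.
  onVisitPage: async ({ page, pushData }) => {
    const h1 = await page.locator("h1").first().textContent();
    await pushData({ url: page.url(), h1 });
  },
};
```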

+ 45 - 22
src/core.ts

@@ -5,13 +5,19 @@ import { glob } from "glob";
 import { Config } from "../config.js";
 import { Page } from "playwright";

-let pageCounter = 0; 
+let pageCounter = 0;

 export function getPageHtml(page: Page, selector: string) {
   return page.evaluate((selector) => {
     // Check if the selector is an XPath
-    if (selector.startsWith('/')) {
-      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
+    if (selector.startsWith("/")) {
+      const elements = document.evaluate(
+        selector,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null,
+      );
       let result = elements.iterateNext();
       return result ? result.textContent || "" : "";
     } else {
@@ -23,10 +29,20 @@ export function getPageHtml(page: Page, selector: string) {
 }

 export async function waitForXPath(page: Page, xpath: string, timeout: number) {
-  await page.waitForFunction(xpath => {
-    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
-    return elements.iterateNext() !== null;
-  }, xpath, { timeout });
+  await page.waitForFunction(
+    (xpath) => {
+      const elements = document.evaluate(
+        xpath,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null,
+      );
+      return elements.iterateNext() !== null;
+    },
+    xpath,
+    { timeout },
+  );
 }

 export async function crawl(config: Config) {
@@ -41,37 +57,44 @@ export async function crawl(config: Config) {
           const cookie = {
             name: config.cookie.name,
             value: config.cookie.value,
-            url: request.loadedUrl, 
+            url: request.loadedUrl,
           };
           await page.context().addCookies([cookie]);
         }
-  
+
         const title = await page.title();
         pageCounter++;
-        log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
-        
+        log.info(
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+        );
+
         // Use custom handling for XPath selector
-        if (config.selector.startsWith('/')) {
-          await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
+        if (config.selector.startsWith("/")) {
+          await waitForXPath(
+            page,
+            config.selector,
+            config.waitForSelectorTimeout ?? 1000,
+          );
         } else {
           await page.waitForSelector(config.selector, {
             timeout: config.waitForSelectorTimeout ?? 1000,
           });
         }
-  
+
         const html = await getPageHtml(page, config.selector);
-  
+
         // Save results as JSON to ./storage/datasets/default
         await pushData({ title, url: request.loadedUrl, html });
-  
+
         if (config.onVisitPage) {
           await config.onVisitPage({ page, pushData });
         }
-  
+
         // Extract links from the current page
         // and add them to the crawling queue.
         await enqueueLinks({
-          globs: typeof config.match === "string" ? [config.match] : config.match,
+          globs:
+            typeof config.match === "string" ? [config.match] : config.match,
         });
       },
       // Comment this option to scrape the full website.
@@ -79,22 +102,22 @@ export async function crawl(config: Config) {
       // Uncomment this option to see the browser window.
       // headless: false,
     });
-  
+
     // Add first URL to the queue and start the crawl.
     await crawler.run([config.url]);
-  }  
+  }
 }

 export async function write(config: Config) {
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });
-  
+
   const results = [];
   for (const file of jsonFiles) {
     const data = JSON.parse(await readFile(file, "utf-8"));
     results.push(data);
   }
-  
+
   await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
 }
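
Taken together, a run of the crawler reduces to these two exported functions; a minimal sketch of an entry point, assuming ESM output like the `dist/src/main.js` referenced in package.json:

```ts
import { crawl, write } from "./src/core.js";
import { defaultConfig } from "./config.js";

async function main() {
  // Crawl pages into ./storage/datasets/default as individual JSON files...
  await crawl(defaultConfig);
  // ...then merge them into the single file named by outputFileName.
  await write(defaultConfig);
}

main().catch(console.error);
```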
 }