
Merge pull request #38 from marcelovicentegc/main

Make GPT Crawler a CLI
Steve Sewell, 1 year ago
commit 4771ddbef8
10 changed files with 1206 additions and 1181 deletions
  1. .gitignore (+3 -0)
  2. Dockerfile (+1 -1)
  3. README.md (+48 -23)
  4. config.ts (+27 -8)
  5. package-lock.json (+912 -1051)
  6. package.json (+9 -1)
  7. src/cli.ts (+100 -0)
  8. src/core.ts (+100 -0)
  9. src/main.ts (+4 -96)
  10. tsconfig.json (+2 -1)

+ 3 - 0
.gitignore

@@ -6,3 +6,6 @@ node_modules
 apify_storage
 crawlee_storage
 storage
+
+# any output from the crawler
+.json

+ 1 - 1
Dockerfile

@@ -48,4 +48,4 @@ COPY --chown=myuser . ./
 
 # Run the image. If you know you won't need headful browsers,
 # you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

+ 48 - 23
README.md

@@ -1,51 +1,63 @@
-# GPT Crawler
+<!-- Markdown written with https://marketplace.visualstudio.com/items?itemName=yzhang.markdown-all-in-one -->
+
+# GPT Crawler <!-- omit from toc -->
 
 Crawl a site to generate knowledge files to create your own custom GPT from one or multiple URLs
 
 ![Gif showing the crawl run](https://github.com/BuilderIO/gpt-crawler/assets/844291/feb8763a-152b-4708-9c92-013b5c70d2f2)
 
+- [Example](#example)
+- [Get started](#get-started)
+  - [Running locally](#running-locally)
+    - [Clone the repository](#clone-the-repository)
+    - [Install dependencies](#install-dependencies)
+    - [Configure the crawler](#configure-the-crawler)
+    - [Run your crawler](#run-your-crawler)
+  - [Alternative methods](#alternative-methods)
+    - [Running in a container with Docker](#running-in-a-container-with-docker)
+    - [Running as a CLI](#running-as-a-cli)
+      - [Development](#development)
+  - [Upload your data to OpenAI](#upload-your-data-to-openai)
+    - [Create a custom GPT](#create-a-custom-gpt)
+    - [Create a custom assistant](#create-a-custom-assistant)
+- [Contributing](#contributing)
 
 ## Example
 
-[Here is a custom GPT](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) that I quickly made to help answer questions about how to use and integrate [Builder.io](https://www.builder.io) by simply providing the URL to the Builder docs. 
+[Here is a custom GPT](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) that I quickly made to help answer questions about how to use and integrate [Builder.io](https://www.builder.io) by simply providing the URL to the Builder docs.
 
 This project crawled the docs and generated the file that I uploaded as the basis for the custom GPT.
 
-[Try it out yourself](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) by asking questions about how to integrate Builder.io into a site. 
+[Try it out yourself](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) by asking questions about how to integrate Builder.io into a site.
 
 > Note that you may need a paid ChatGPT plan to access this feature
 
 ## Get started
 
-### Prerequisites
+### Running locally
 
-Be sure you have Node.js >= 16 installed
+#### Clone the repository
 
-### Clone the repo
+Be sure you have Node.js >= 16 installed.
 
 ```sh
 git clone https://github.com/builderio/gpt-crawler
 ```
 
-### Install Dependencies
+#### Install dependencies
 
 ```sh
 npm i
 ```
 
-If you do not have Playwright installed:
-```sh
-npx playwright install
-```
-
-### Configure the crawler
+#### Configure the crawler
 
 Open [config.ts](config.ts) and edit the `url` and `selectors` properties to match your needs.
 
 E.g. to crawl the Builder.io docs to make our custom GPT you can use:
 
 ```ts
-export const config: Config = {
+export const defaultConfig: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
   selector: `.docs-builder-container`,
@@ -69,23 +81,41 @@ type Config = {
   /** File name for the finished data */
   outputFileName: string;
   /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: {name: string; value: string}
+  cookie?: { name: string; value: string };
   /** Optional function to run for each page found */
   onVisitPage?: (options: {
     page: Page;
     pushData: (data: any) => Promise<void>;
   }) => Promise<void>;
-    /** Optional timeout for waiting for a selector to appear */
-    waitForSelectorTimeout?: number;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
 };
 ```
 
-### Run your crawler
+#### Run your crawler
 
 ```sh
 npm start
 ```
 
+### Alternative methods
+
+#### [Running in a container with Docker](./containerapp/README.md)
+
+To obtain the `output.json` with a containerized execution, go into the `containerapp` directory and modify `config.ts` as described above. The `output.json` file should be generated in the `data` folder. Note: the `outputFileName` property in the `config.ts` file in the `containerapp` folder is configured to work with the container.
+
+#### Running as a CLI
+
+<!-- TODO: Needs to be actually published -->
+
+##### Development
+
+To run the CLI locally while developing it:
+  
+```sh
+npm run start:cli --url https://www.builder.io/c/docs/developers --match https://www.builder.io/c/docs/** --selector .docs-builder-container --maxPagesToCrawl 50 --outputFileName output.json
+```
+
 ### Upload your data to OpenAI
 
 The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
@@ -105,7 +135,6 @@ Use this option for UI access to your generated knowledge that you can easily sh
 
 ![Gif of how to upload a custom GPT](https://github.com/BuilderIO/gpt-crawler/assets/844291/22f27fb5-6ca5-4748-9edd-6bcf00b408cf)
 
-
 #### Create a custom assistant
 
 Use this option for API access to your generated knowledge that you can integrate into your product.
@@ -116,10 +145,6 @@ Use this option for API access to your generated knowledge that you can integrat
 
 ![Gif of how to upload to an assistant](https://github.com/BuilderIO/gpt-crawler/assets/844291/06e6ad36-e2ba-4c6e-8d5a-bf329140de49)
 
-## (Alternate method) Running in a container with Docker
-To obtain the `output.json` with a containerized execution. Go into the `containerapp` directory. Modify the `config.ts` same as above, the `output.json`file should be generated in the data folder. Note : the `outputFileName` property in the `config.ts` file in containerapp folder is configured to work with the container. 
-
-
 ## Contributing
 
 Know how to make this project better? Send a PR!

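For readers following the "Upload your data to OpenAI" step above, here is a minimal sketch of the record shape that ends up in `output.json`, inferred from the `pushData({ title, url, html })` call added in `src/core.ts` below; the type name and sample values are illustrative placeholders, not real crawl output.

```ts
// Shape of each entry that write() collects into output.json (inferred from
// src/core.ts in this PR); the type name and values below are placeholders.
type CrawledPage = {
  title: string;           // page.title()
  url: string | undefined; // request.loadedUrl
  html: string;            // text grabbed via the configured selector
};

const sample: CrawledPage[] = [
  {
    title: "Developer docs",
    url: "https://www.builder.io/c/docs/developers",
    html: "…extracted page text…",
  },
];
```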
+ 27 - 8
config.ts

@@ -1,14 +1,33 @@
 import { Page } from "playwright";
-type Config = {
-  /** URL to start the crawl */
+
+export type Config = {
+  /**
+   * URL to start the crawl
+   * @example "https://www.builder.io/c/docs/developers"
+   * @default ""
+   */
   url: string;
-  /** Pattern to match against for links on a page to subsequently crawl */
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
   match: string | string[];
-  /** Selector to grab the inner text from */
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   */
   selector: string;
-  /** Don't crawl more than this many pages */
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
   maxPagesToCrawl: number;
-  /** File name for the finished data */
+  /**
+   * File name for the finished data
+   * @default "output.json"
+   */
   outputFileName: string;
   /** Optional cookie to be set. E.g. for Cookie Consent */
   cookie?: { name: string; value: string };
@@ -21,10 +40,10 @@ type Config = {
   waitForSelectorTimeout?: number;
 };
 
-export const config: Config = {
+export const defaultConfig: Config = {
   url: "https://www.builder.io/c/docs/developers",
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
   match: "https://www.builder.io/c/docs/**",
   selector: `.docs-builder-container`,
   selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
   maxPagesToCrawl: 50,
-  outputFileName: "output.json",
+  outputFileName: "../output.json",
 };

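Since `Config` is now exported and `config` was renamed to `defaultConfig`, a downstream project can keep its own configuration in a separate file. A minimal sketch, assuming a hypothetical docs site; only the five required fields are set, with the optional ones shown commented out.

```ts
// my.config.ts — hypothetical consumer-side config (not part of this diff)
import { Config } from "./config.js";

export const myConfig: Config = {
  url: "https://docs.example.com/",     // hypothetical start URL
  match: "https://docs.example.com/**", // follow links under this pattern
  selector: "main",                     // grab the inner text of <main>
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // Optional fields from the Config type:
  // cookie: { name: "CookieConsent", value: "accepted" },
  // waitForSelectorTimeout: 3000,
};
```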
File diff suppressed because it is too large
+ 912 - 1051
package-lock.json


+ 9 - 1
package.json

@@ -2,22 +2,30 @@
   "name": "@builder.io/gpt-crawler",
   "name": "@builder.io/gpt-crawler",
   "version": "0.0.1",
   "version": "0.0.1",
   "type": "module",
   "type": "module",
+  "bin": {
+    "gpt-crawler": "./dist/src/cli.js"
+  },
   "description": "Crawl a site to generate knowledge files to create your own custom GPT",
   "description": "Crawl a site to generate knowledge files to create your own custom GPT",
   "dependencies": {
   "dependencies": {
+    "commander": "^11.1.0",
     "crawlee": "^3.0.0",
     "crawlee": "^3.0.0",
     "glob": "^10.3.10",
     "glob": "^10.3.10",
+    "inquirer": "^9.2.12",
     "playwright": "*"
     "playwright": "*"
   },
   },
   "devDependencies": {
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
     "@apify/tsconfig": "^0.1.0",
+    "@types/inquirer": "^9.0.7",
     "@types/node": "^20.0.0",
     "@types/node": "^20.0.0",
     "ts-node": "^10.8.0",
     "ts-node": "^10.8.0",
     "typescript": "^5.0.0"
     "typescript": "^5.0.0"
   },
   },
   "scripts": {
   "scripts": {
+    "preinstall": "npx playwright install",
     "start": "npm run start:dev",
     "start": "npm run start:dev",
+    "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
+    "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/main.js",
     "start:prod": "node dist/main.js",
-    "start:dev": "node --no-warnings=ExperimentalWarning --loader ts-node/esm/transpile-only src/main.ts",
     "build": "tsc"
     "build": "tsc"
   },
   },
   "author": "It's not you it's me",
   "author": "It's not you it's me",

+ 100 - 0
src/cli.ts

@@ -0,0 +1,100 @@
+#!/usr/bin/env node
+
+import { program } from "commander";
+import { Config } from "../config.js";
+import { crawl, write } from "./core.js";
+import { createRequire } from "node:module";
+import inquirer from "inquirer";
+
+const require = createRequire(import.meta.url);
+const { version, description } = require("../../package.json");
+
+const messages = {
+  url: "What is the first URL of the website you want to crawl?",
+  match: "What is the URL pattern you want to match?",
+  selector: "What is the CSS selector you want to match?",
+  maxPagesToCrawl: "How many pages do you want to crawl?",
+  outputFileName: "What is the name of the output file?",
+};
+
+async function handler(options: Config) {
+  try {
+    const {
+      url,
+      match,
+      selector,
+      maxPagesToCrawl: maxPagesToCrawlStr,
+      outputFileName,
+    } = options;
+
+    // @ts-ignore
+    const maxPagesToCrawl = parseInt(maxPagesToCrawlStr, 10);
+
+    let config: Config = {
+      url,
+      match,
+      selector,
+      maxPagesToCrawl,
+      outputFileName,
+    };
+
+    if (!config.url || !config.match || !config.selector) {
+      const questions = [];
+
+      if (!config.url) {
+        questions.push({
+          type: "input",
+          name: "url",
+          message: messages.url,
+        });
+      }
+
+      if (!config.match) {
+        questions.push({
+          type: "input",
+          name: "match",
+          message: messages.match,
+        });
+      }
+
+      if (!config.selector) {
+        questions.push({
+          type: "input",
+          name: "selector",
+          message: messages.selector,
+        });
+      }
+
+      const answers = await inquirer
+        .prompt(questions);
+
+      config = {
+        ...config,
+        ...answers,
+      };
+    }
+
+    await crawl(config);
+    await write(config);
+  } catch (error) {
+    console.log(error);
+  }
+}
+
+program
+  .version(version)
+  .description(description);
+
+program
+  .option("-u, --url <string>", messages.url, "")
+  .option("-m, --match <string>", messages.match, "")
+  .option("-s, --selector <string>", messages.selector, "")
+  .option("-m, --maxPagesToCrawl <number>", messages.maxPagesToCrawl, "50")
+  .option(
+    "-o, --outputFileName <string>",
+    messages.outputFileName,
+    "output.json",
+  )
+  .action(handler);
+
+program.parse();

+ 100 - 0
src/core.ts

@@ -0,0 +1,100 @@
+// For more information, see https://crawlee.dev/
+import { PlaywrightCrawler } from "crawlee";
+import { readFile, writeFile } from "fs/promises";
+import { glob } from "glob";
+import { Config } from "../config.js";
+import { Page } from "playwright";
+
+let pageCounter = 0; 
+
+export function getPageHtml(page: Page, selector: string) {
+  return page.evaluate((selector) => {
+    // Check if the selector is an XPath
+    if (selector.startsWith('/')) {
+      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
+      let result = elements.iterateNext();
+      return result ? result.textContent || "" : "";
+    } else {
+      // Handle as a CSS selector
+      const el = document.querySelector(selector) as HTMLElement | null;
+      return el?.innerText || "";
+    }
+  }, selector);
+}
+
+export async function waitForXPath(page: Page, xpath: string, timeout: number) {
+  await page.waitForFunction(xpath => {
+    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
+    return elements.iterateNext() !== null;
+  }, xpath, { timeout });
+}
+
+export async function crawl(config: Config) {
+  if (process.env.NO_CRAWL !== "true") {
+    // PlaywrightCrawler crawls the web using a headless
+    // browser controlled by the Playwright library.
+    const crawler = new PlaywrightCrawler({
+      // Use the requestHandler to process each of the crawled pages.
+      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+        if (config.cookie) {
+          // Set the cookie for the specific URL
+          const cookie = {
+            name: config.cookie.name,
+            value: config.cookie.value,
+            url: request.loadedUrl, 
+          };
+          await page.context().addCookies([cookie]);
+        }
+  
+        const title = await page.title();
+        pageCounter++;
+        log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
+        
+        // Use custom handling for XPath selector
+        if (config.selector.startsWith('/')) {
+          await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
+        } else {
+          await page.waitForSelector(config.selector, {
+            timeout: config.waitForSelectorTimeout ?? 1000,
+          });
+        }
+  
+        const html = await getPageHtml(page, config.selector);
+  
+        // Save results as JSON to ./storage/datasets/default
+        await pushData({ title, url: request.loadedUrl, html });
+  
+        if (config.onVisitPage) {
+          await config.onVisitPage({ page, pushData });
+        }
+  
+        // Extract links from the current page
+        // and add them to the crawling queue.
+        await enqueueLinks({
+          globs: typeof config.match === "string" ? [config.match] : config.match,
+        });
+      },
+      // Comment this option to scrape the full website.
+      maxRequestsPerCrawl: config.maxPagesToCrawl,
+      // Uncomment this option to see the browser window.
+      // headless: false,
+    });
+  
+    // Add first URL to the queue and start the crawl.
+    await crawler.run([config.url]);
+  }  
+}
+
+export async function write(config: Config) {
+  const jsonFiles = await glob("storage/datasets/default/*.json", {
+    absolute: true,
+  });
+  
+  const results = [];
+  for (const file of jsonFiles) {
+    const data = JSON.parse(await readFile(file, "utf-8"));
+    results.push(data);
+  }
+  
+  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+}

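The extraction into `src/core.ts` also makes the crawler usable programmatically, which is what the slimmed-down `src/main.ts` below does with `defaultConfig`. A hedged sketch with a hypothetical site and an optional `onVisitPage` hook (per the code above, the hook runs in addition to the default `pushData` call for each page):

```ts
// Sketch only: the URLs, selector, and onVisitPage body are illustrative.
import { Config } from "../config.js";
import { crawl, write } from "./core.js";

const config: Config = {
  url: "https://docs.example.com/",
  match: "https://docs.example.com/**",
  selector: "main", // a selector starting with "/" is treated as XPath
  maxPagesToCrawl: 25,
  outputFileName: "docs.json",
  // Optional: push extra data for every crawled page.
  onVisitPage: async ({ page, pushData }) => {
    await pushData({ heading: await page.locator("h1").first().textContent() });
  },
};

await crawl(config); // writes per-page JSON to storage/datasets/default
await write(config); // merges those files into docs.json
```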
+ 4 - 96
src/main.ts

@@ -1,97 +1,5 @@
-// For more information, see https://crawlee.dev/
-import { PlaywrightCrawler } from "crawlee";
-import { readFile, writeFile } from "fs/promises";
-import { glob } from "glob";
-import { config } from "../config.js";
-import { Page } from "playwright";
+import { defaultConfig } from "../config.js";
+import { crawl, write } from "./core.js";
 
-let pageCounter = 0; 
-
-export function getPageHtml(page: Page, selector: string) {
-  return page.evaluate((selector) => {
-    // Check if the selector is an XPath
-    if (selector.startsWith('/')) {
-      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
-      let result = elements.iterateNext();
-      return result ? result.textContent || "" : "";
-    } else {
-      // Handle as a CSS selector
-      const el = document.querySelector(selector) as HTMLElement | null;
-      return el?.innerText || "";
-    }
-  }, selector);
-}
-
-export async function waitForXPath(page: Page, xpath: string, timeout: number) {
-  await page.waitForFunction(xpath => {
-    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
-    return elements.iterateNext() !== null;
-  }, xpath, { timeout });
-}
-
-if (process.env.NO_CRAWL !== "true") {
-  // PlaywrightCrawler crawls the web using a headless
-  // browser controlled by the Playwright library.
-  const crawler = new PlaywrightCrawler({
-    // Use the requestHandler to process each of the crawled pages.
-    async requestHandler({ request, page, enqueueLinks, log, pushData }) {
-
-      if (config.cookie) {
-        // Set the cookie for the specific URL
-        const cookie = {
-          name: config.cookie.name,
-          value: config.cookie.value,
-          url: request.loadedUrl, 
-        };
-        await page.context().addCookies([cookie]);
-      }
-
-      const title = await page.title();
-      pageCounter++;
-      log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
-      
-      // Use custom handling for XPath selector
-      if (config.selector.startsWith('/')) {
-        await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
-      } else {
-        await page.waitForSelector(config.selector, {
-          timeout: config.waitForSelectorTimeout ?? 1000,
-        });
-      }
-
-      const html = await getPageHtml(page, config.selector);
-
-      // Save results as JSON to ./storage/datasets/default
-      await pushData({ title, url: request.loadedUrl, html });
-
-      if (config.onVisitPage) {
-        await config.onVisitPage({ page, pushData });
-      }
-
-      // Extract links from the current page
-      // and add them to the crawling queue.
-      await enqueueLinks({
-        globs: typeof config.match === "string" ? [config.match] : config.match,
-      });
-    },
-    // Comment this option to scrape the full website.
-    maxRequestsPerCrawl: config.maxPagesToCrawl,
-    // Uncomment this option to see the browser window.
-    // headless: false,
-  });
-
-  // Add first URL to the queue and start the crawl.
-  await crawler.run([config.url]);
-}
-
-const jsonFiles = await glob("storage/datasets/default/*.json", {
-  absolute: true,
-});
-
-const results = [];
-for (const file of jsonFiles) {
-  const data = JSON.parse(await readFile(file, "utf-8"));
-  results.push(data);
-}
-
-await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+await crawl(defaultConfig);
+await write(defaultConfig);

+ 2 - 1
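One consequence of the split worth noting: `write()` only globs `storage/datasets/default/*.json` and merges the results, so the output file can be regenerated from an existing crawl without re-crawling (the same idea the `NO_CRAWL` check inside `crawl()` supports). A small sketch; the alternate file name is hypothetical:

```ts
// Rebuild the merged output from a previous crawl's dataset without crawling.
import { defaultConfig } from "../config.js";
import { write } from "./core.js";

await write({ ...defaultConfig, outputFileName: "rebuilt-output.json" });
```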
tsconfig.json

@@ -6,7 +6,8 @@
     "outDir": "dist",
     "outDir": "dist",
     "resolveJsonModule": true,
     "resolveJsonModule": true,
     "noUnusedLocals": false,
     "noUnusedLocals": false,
+    "skipLibCheck": true,
     "lib": ["DOM"]
     "lib": ["DOM"]
   },
   },
-  "include": ["./src/**/*", "./config.ts"]
+  "include": ["./src/**/*", "config.ts"]
 }