Steve Sewell 1 year ago
parent
commit
e07b50d022
Changed 7 files with 134 additions and 5434 deletions
  1. README.md (+76 -6)
  2. config.ts (+18 -6)
  3. forum.json (+0 -7)
  4. github.json (+0 -3987)
  5. output.json (+0 -1397)
  6. package.json (+23 -24)
  7. src/main.ts (+17 -7)

+ 76 - 6
README.md

@@ -1,9 +1,79 @@
-# Getting started with Crawlee
+# GPT Crawler
 
-This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).
+Crawl a site to generate knowledge files to create your own custom GPT
 
-You can find more examples and documentation at the following links:
+## Get started
 
-- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
-- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
-- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
+### Prerequisites
+
+Be sure you have Node.js >= 16 installed
+
+### Clone the repo
+
+```sh
+git clone https://github.com/builderio/gpt-crawler
+```
+
+### Configure the crawler
+
+Open [config.ts](config.ts) and edit the `url` and `selectors` properties to match your needs.
+
+For example, to crawl the Builder.io docs to make your own custom GPT, you can use:
+
+```ts
+export const config: Config = {
+  url: "https://www.builder.io/c/docs/developers",
+  match: "https://www.builder.io/c/docs/**",
+  selector: `.docs-builder-container`,
+  maxPagesToCrawl: 1000,
+  outputFileName: "output.json",
+};
+```
+
+See the top of the file for the type definition of everything you can configure:
+
+```ts
+type Config = {
+  /** URL to start the crawl */
+  url: string;
+  /** Pattern to match against for links on a page to subsequently crawl */
+  match: string;
+  /** Selector to grab the inner text from */
+  selector: string;
+  /** Don't crawl more than this many pages */
+  maxPagesToCrawl: number;
+  /** File name for the finished data */
+  outputFileName: string;
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+};
+```
+
+### Run your crawler
+
+```sh
+npm start
+```
+
+### Upload your data to OpenAI
+
+The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom GPT or custom assistant.
+
+## Contributing
+
+Know how to make this project better? Send a PR!
+
+<br>
+<br>
+
+<p align="center">
+   <a href="https://www.builder.io/m/developers">
+      <picture>
+         <source media="(prefers-color-scheme: dark)" srcset="https://user-images.githubusercontent.com/844291/230786554-eb225eeb-2f6b-4286-b8c2-535b1131744a.png">
+         <img width="250" alt="Made with love by Builder.io" src="https://user-images.githubusercontent.com/844291/230786555-a58479e4-75f3-4222-a6eb-74c5af953eac.png">
+       </picture>
+   </a>
+</p>
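
The new `onVisitPage` hook documented in the README above can be wired up directly in `config.ts`. A minimal sketch (the extra field pushed here is purely illustrative and not part of this commit):

```ts
export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
  // Optional hook: called once for every crawled page
  onVisitPage: async ({ page, pushData }) => {
    // Illustrative only: also record each page's <title> in the dataset
    await pushData({ title: await page.title() });
  },
};
```

The crawler already pushes `{ title, url, html }` for every page (see `src/main.ts` below), so the hook is only needed for extra per-page data you want alongside that.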

+ 18 - 6
config.ts

@@ -1,15 +1,27 @@
+import { Page } from "playwright";
+
 type Config = {
+  /** URL to start the crawl */
   url: string;
+  /** Pattern to match against for links on a page to subsequently crawl */
   match: string;
+  /** Selector to grab the inner text from */
   selector: string;
+  /** Don't crawl more than this many pages */
   maxPagesToCrawl: number;
+  /** File name for the finished data */
   outputFileName: string;
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
 };
 
-export const config = {
-  url: "https://github.com/builderio/builder",
-  match: "https://github.com/BuilderIO/builder/tree/main/**",
-  selector: `#readme,[data-selector="repos-split-pane-content"]`,
+export const config: Config = {
+  url: "https://www.builder.io/c/docs/developers",
+  match: "https://www.builder.io/c/docs/**",
+  selector: `.docs-builder-container`,
   maxPagesToCrawl: 1000,
-  outputFileName: "github.json",
-} satisfies Config;
+  outputFileName: "output.json",
+};

+ 0 - 7
forum.json

@@ -1,7 +0,0 @@
-[
-  {
-    "title": "Builder.io Forum - Help and tips for Builder.io",
-    "url": "https://forum.builder.io/",
-    "html": ""
-  }
-]

+ 0 - 3987
github.json

File diff suppressed because it is too large

+ 0 - 1397
output.json

File diff suppressed because it is too large


+ 23 - 24
package.json

@@ -1,26 +1,25 @@
 {
-    "name": "gpt-crawler",
-    "version": "0.0.1",
-    "type": "module",
-    "description": "This is an example of a Crawlee project.",
-    "dependencies": {
-        "crawlee": "^3.0.0",
-        "glob": "^10.3.10",
-        "playwright": "*"
-    },
-    "devDependencies": {
-        "@apify/tsconfig": "^0.1.0",
-        "@types/node": "^20.0.0",
-        "ts-node": "^10.8.0",
-        "typescript": "^5.0.0"
-    },
-    "scripts": {
-        "start": "npm run start:dev",
-        "start:prod": "node dist/main.js",
-        "start:dev": "node --no-warnings=ExperimentalWarning --loader ts-node/esm/transpile-only src/main.ts",
-        "build": "tsc",
-        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
-    },
-    "author": "It's not you it's me",
-    "license": "ISC"
+  "name": "@builder.io/gpt-crawler",
+  "version": "0.0.1",
+  "type": "module",
+  "description": "Scrape ",
+  "dependencies": {
+    "crawlee": "^3.0.0",
+    "glob": "^10.3.10",
+    "playwright": "*"
+  },
+  "devDependencies": {
+    "@apify/tsconfig": "^0.1.0",
+    "@types/node": "^20.0.0",
+    "ts-node": "^10.8.0",
+    "typescript": "^5.0.0"
+  },
+  "scripts": {
+    "start": "npm run start:dev",
+    "start:prod": "node dist/main.js",
+    "start:dev": "node --no-warnings=ExperimentalWarning --loader ts-node/esm/transpile-only src/main.ts",
+    "build": "tsc"
+  },
+  "author": "It's not you it's me",
+  "license": "ISC"
 }
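
Together with the `NO_CRAWL` guard in `src/main.ts` below, the scripts above give a few ways to run the project. A sketch, assuming a POSIX shell:

```sh
# Development run (ts-node via the default start script)
npm start

# Compile, then run the built output (start:prod runs node dist/main.js)
npm run build
npm run start:prod

# Skip the crawl step: src/main.ts only crawls when NO_CRAWL !== "true"
NO_CRAWL=true npm start
```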

+ 17 - 7
src/main.ts

@@ -3,6 +3,14 @@ import { PlaywrightCrawler } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { config } from "../config.js";
+import { Page } from "playwright";
+
+export function getPageHtml(page: Page) {
+  return page.evaluate((selector) => {
+    const el = document.querySelector(selector) as HTMLElement | null;
+    return el?.innerText || "";
+  }, config.selector);
+}
 
 if (process.env.NO_CRAWL !== "true") {
   // PlaywrightCrawler crawls the web using a headless
@@ -11,19 +19,21 @@ if (process.env.NO_CRAWL !== "true") {
     // Use the requestHandler to process each of the crawled pages.
     async requestHandler({ request, page, enqueueLinks, log, pushData }) {
       const title = await page.title();
-      log.info(`Title of ${request.loadedUrl} is '${title}'`);
-
-      const html = await page.evaluate(() => {
-        const el = document.querySelector(
-          ".docs-builder-container"
-        ) as HTMLElement | null;
+      log.info(`Crawling ${request.loadedUrl}...`);
 
-        return el?.innerText || "";
+      await page.waitForSelector(config.selector, {
+        timeout: 1000,
       });
 
+      const html = await getPageHtml(page);
+
       // Save results as JSON to ./storage/datasets/default
       await pushData({ title, url: request.loadedUrl, html });
 
+      if (config.onVisitPage) {
+        await config.onVisitPage({ page, pushData });
+      }
+
       // Extract links from the current page
       // and add them to the crawling queue.
       await enqueueLinks({
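
Each record the handler pushes has the shape `{ title, url, html }` (see the `pushData` call above and the removed `forum.json` for a concrete instance). Assuming `output.json` ends up as a JSON array of those records, as the deleted JSON files were, downstream code could read it like this (a sketch, not part of this commit):

```ts
import { readFile } from "fs/promises";

// Sketch of the record shape produced by pushData in src/main.ts
type CrawledPage = {
  title: string; // page <title>
  url: string;   // URL the page was loaded from
  html: string;  // innerText captured from the configured selector
};

const records: CrawledPage[] = JSON.parse(
  await readFile("output.json", "utf-8"),
);
console.log(`Crawled ${records.length} pages`);
```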