Steve Sewell 1 year ago
parent
commit
e07b50d022
Changed 7 files with 134 additions and 5434 deletions
  1. README.md (+76 -6)
  2. config.ts (+18 -6)
  3. forum.json (+0 -7)
  4. github.json (+0 -3987)
  5. output.json (+0 -1397)
  6. package.json (+23 -24)
  7. src/main.ts (+17 -7)

+ 76 - 6
README.md

@@ -1,9 +1,79 @@
-# Getting started with Crawlee
+# GPT Crawler
 
-This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).
+Crawl a site to generate knowledge files to create your own custom GPT
 
-You can find more examples and documentation at the following links:
+## Get started
 
-- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
-- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
-- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
+### Prerequisites
+
+Be sure you have Node.js >= 16 installed
+
+### Clone the repo
+
+```sh
+git clone https://github.com/builderio/gpt-crawler
+```
+
+### Configure the crawler
+
+Open [config.ts](config.ts) and edit the `url` and `selectors` properties to match your needs.
+
+For example, to crawl the Builder.io docs to make your own custom GPT, you can use:
+
+```ts
+export const config: Config = {
+  url: "https://www.builder.io/c/docs/developers",
+  match: "https://www.builder.io/c/docs/**",
+  selector: `.docs-builder-container`,
+  maxPagesToCrawl: 1000,
+  outputFileName: "output.json",
+};
+```
+
+See the top of the file for the type definition of everything you can configure:
+
+```ts
+type Config = {
+  /** URL to start the crawl */
+  url: string;
+  /** Pattern to match against for links on a page to subsequently crawl */
+  match: string;
+  /** Selector to grab the inner text from */
+  selector: string;
+  /** Don't crawl more than this many pages */
+  maxPagesToCrawl: number;
+  /** File name for the finished data */
+  outputFileName: string;
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+};
+```
+
+### Run your crawler
+
+```sh
+npm start
+```
+
+### Upload your data to OpenAI
+
+The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom GPT or custom assistant.
+
+## Contributing
+
+Know how to make this project better? Send a PR!
+
+<br>
+<br>
+
+<p align="center">
+   <a href="https://www.builder.io/m/developers">
+      <picture>
+         <source media="(prefers-color-scheme: dark)" srcset="https://user-images.githubusercontent.com/844291/230786554-eb225eeb-2f6b-4286-b8c2-535b1131744a.png">
+         <img width="250" alt="Made with love by Builder.io" src="https://user-images.githubusercontent.com/844291/230786555-a58479e4-75f3-4222-a6eb-74c5af953eac.png">
+       </picture>
+   </a>
+</p>
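
The new `onVisitPage` hook documented in the README above can be wired up directly in `config.ts`. A minimal sketch (the extra field pushed here is purely illustrative and not part of this commit):

```ts
export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: `.docs-builder-container`,
  maxPagesToCrawl: 1000,
  outputFileName: "output.json",
  // Optional hook: called once for every crawled page
  onVisitPage: async ({ page, pushData }) => {
    // Illustrative only: also record each page's <title> in the dataset
    await pushData({ title: await page.title() });
  },
};
```

The crawler already pushes `{ title, url, html }` for every page (see `src/main.ts` below), so the hook is only needed for extra per-page data you want alongside that.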

+ 18 - 6
config.ts

@@ -1,15 +1,27 @@
+import { Page } from "playwright";
+
 type Config = {
+  /** URL to start the crawl */
   url: string;
+  /** Pattern to match against for links on a page to subsequently crawl */
   match: string;
+  /** Selector to grab the inner text from */
   selector: string;
+  /** Don't crawl more than this many pages */
   maxPagesToCrawl: number;
+  /** File name for the finished data */
   outputFileName: string;
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
 };
 
-export const config = {
-  url: "https://github.com/builderio/builder",
-  match: "https://github.com/BuilderIO/builder/tree/main/**",
-  selector: `#readme,[data-selector="repos-split-pane-content"]`,
+export const config: Config = {
+  url: "https://www.builder.io/c/docs/developers",
+  match: "https://www.builder.io/c/docs/**",
+  selector: `.docs-builder-container`,
   maxPagesToCrawl: 1000,
-  outputFileName: "github.json",
-} satisfies Config;
+  outputFileName: "output.json",
+};

+ 0 - 7
forum.json

@@ -1,7 +0,0 @@
-[
-  {
-    "title": "Builder.io Forum - Help and tips for Builder.io",
-    "url": "https://forum.builder.io/",
-    "html": ""
-  }
-]

+ 0 - 3987
github.json

File diff suppressed because it is too large

+ 0 - 1397
output.json

File diff suppressed because it is too large


+ 23 - 24
package.json

@@ -1,26 +1,25 @@
 {
-    "name": "gpt-crawler",
-    "version": "0.0.1",
-    "type": "module",
-    "description": "This is an example of a Crawlee project.",
-    "dependencies": {
-        "crawlee": "^3.0.0",
-        "glob": "^10.3.10",
-        "playwright": "*"
-    },
-    "devDependencies": {
-        "@apify/tsconfig": "^0.1.0",
-        "@types/node": "^20.0.0",
-        "ts-node": "^10.8.0",
-        "typescript": "^5.0.0"
-    },
-    "scripts": {
-        "start": "npm run start:dev",
-        "start:prod": "node dist/main.js",
-        "start:dev": "node --no-warnings=ExperimentalWarning --loader ts-node/esm/transpile-only src/main.ts",
-        "build": "tsc",
-        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
-    },
-    "author": "It's not you it's me",
-    "license": "ISC"
+  "name": "@builder.io/gpt-crawler",
+  "version": "0.0.1",
+  "type": "module",
+  "description": "Scrape ",
+  "dependencies": {
+    "crawlee": "^3.0.0",
+    "glob": "^10.3.10",
+    "playwright": "*"
+  },
+  "devDependencies": {
+    "@apify/tsconfig": "^0.1.0",
+    "@types/node": "^20.0.0",
+    "ts-node": "^10.8.0",
+    "typescript": "^5.0.0"
+  },
+  "scripts": {
+    "start": "npm run start:dev",
+    "start:prod": "node dist/main.js",
+    "start:dev": "node --no-warnings=ExperimentalWarning --loader ts-node/esm/transpile-only src/main.ts",
+    "build": "tsc"
+  },
+  "author": "It's not you it's me",
+  "license": "ISC"
 }
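
Together with the `NO_CRAWL` guard in `src/main.ts` below, the scripts above give a few ways to run the project. A sketch, assuming a POSIX shell:

```sh
# Development run (ts-node via the default start script)
npm start

# Compile, then run the built output (start:prod runs node dist/main.js)
npm run build
npm run start:prod

# Skip the crawl step: src/main.ts only crawls when NO_CRAWL !== "true"
NO_CRAWL=true npm start
```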

+ 17 - 7
src/main.ts

@@ -3,6 +3,14 @@ import { PlaywrightCrawler } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { config } from "../config.js";
+import { Page } from "playwright";
+
+export function getPageHtml(page: Page) {
+  return page.evaluate((selector) => {
+    const el = document.querySelector(selector) as HTMLElement | null;
+    return el?.innerText || "";
+  }, config.selector);
+}
 
 if (process.env.NO_CRAWL !== "true") {
   // PlaywrightCrawler crawls the web using a headless
@@ -11,19 +19,21 @@ if (process.env.NO_CRAWL !== "true") {
     // Use the requestHandler to process each of the crawled pages.
     async requestHandler({ request, page, enqueueLinks, log, pushData }) {
       const title = await page.title();
-      log.info(`Title of ${request.loadedUrl} is '${title}'`);
-
-      const html = await page.evaluate(() => {
-        const el = document.querySelector(
-          ".docs-builder-container"
-        ) as HTMLElement | null;
+      log.info(`Crawling ${request.loadedUrl}...`);
 
-        return el?.innerText || "";
+      await page.waitForSelector(config.selector, {
+        timeout: 1000,
       });
 
+      const html = await getPageHtml(page);
+
       // Save results as JSON to ./storage/datasets/default
       await pushData({ title, url: request.loadedUrl, html });
 
+      if (config.onVisitPage) {
+        await config.onVisitPage({ page, pushData });
+      }
+
       // Extract links from the current page
       // and add them to the crawling queue.
       await enqueueLinks({
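
Each record the handler pushes has the shape `{ title, url, html }` (see the `pushData` call above and the removed `forum.json` for a concrete instance). Assuming `output.json` ends up as a JSON array of those records, as the deleted JSON files were, downstream code could read it like this (a sketch, not part of this commit):

```ts
import { readFile } from "fs/promises";

// Sketch of the record shape produced by pushData in src/main.ts
type CrawledPage = {
  title: string; // page <title>
  url: string;   // URL the page was loaded from
  html: string;  // innerText captured from the configured selector
};

const records: CrawledPage[] = JSON.parse(
  await readFile("output.json", "utf-8"),
);
console.log(`Crawled ${records.length} pages`);
```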