|
@@ -1,9 +1,79 @@
|
|
|
-# Getting started with Crawlee
|
|
|
+# GPT Crawler
|
|
|
|
|
|
-This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).
|
|
|
+Crawl a site to generate knowledge files to create your own custom GPT
|
|
|
|
|
|
-You can find more examples and documentation at the following links:
|
|
|
+## Get started
|
|
|
|
|
|
-- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
|
|
|
-- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
|
|
|
-- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)
|
|
|
+### Prerequisites
|
|
|
+
|
|
|
+Be sure you have Node.js >= 16 installed.
|
|
|
+
|
|
|
+### Clone the repo
|
|
|
+
|
|
|
+```sh
|
|
|
+git clone https://github.com/bridgeproject/gpt-crawler
|
|
|
+```
|
|
|
+
|
|
|
+### Configure the crawler
|
|
|
+
|
|
|
+Open [config.ts](config.ts) and edit the `url` and `selector` properties to match your needs.
|
|
|
+
|
|
|
+For example, to crawl the Builder.io docs to make our custom GPT, you can use:
|
|
|
+
|
|
|
+```ts
|
|
|
+export const config: Config = {
|
|
|
+ url: "https://www.builder.io/c/docs/developers",
|
|
|
+ match: "https://www.builder.io/c/docs/**",
|
|
|
+ selector: `.docs-builder-container`,
|
|
|
+ maxPagesToCrawl: 1000,
|
|
|
+ outputFileName: "output.json",
|
|
|
+};
|
|
|
+```
|
|
|
+
|
|
|
+See the top of [config.ts](config.ts) for the type definition of what you can configure:
|
|
|
+
|
|
|
+```ts
|
|
|
+type Config = {
|
|
|
+ /** URL to start the crawl */
|
|
|
+ url: string;
|
|
|
+ /** Pattern to match against for links on a page to subsequently crawl */
|
|
|
+ match: string;
|
|
|
+ /** Selector to grab the inner text from */
|
|
|
+ selector: string;
|
|
|
+ /** Don't crawl more than this many pages */
|
|
|
+ maxPagesToCrawl: number;
|
|
|
+ /** File name for the finished data */
|
|
|
+ outputFileName: string;
|
|
|
+ /** Optional function to run for each page found */
|
|
|
+ onVisitPage?: (options: {
|
|
|
+ page: Page;
|
|
|
+ pushData: (data: any) => Promise<void>;
|
|
|
+ }) => Promise<void>;
|
|
|
+};
|
|
|
+```
|
|
|
+
|
|
|
+### Run your crawler
|
|
|
+
|
|
|
+```sh
|
|
|
+npm start
|
|
|
+```
|
|
|
+
|
|
|
+### Upload your data to OpenAI
|
|
|
+
|
|
|
+The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
|
|
|
+
|
|
|
+## Contributing
|
|
|
+
|
|
|
+Know how to make this project better? Send a PR!
|
|
|
+
|
|
|
+<br>
|
|
|
+<br>
|
|
|
+
|
|
|
+<p align="center">
|
|
|
+ <a href="https://www.builder.io/m/developers">
|
|
|
+ <picture>
|
|
|
+ <source media="(prefers-color-scheme: dark)" srcset="https://user-images.githubusercontent.com/844291/230786554-eb225eeb-2f6b-4286-b8c2-535b1131744a.png">
|
|
|
+ <img width="250" alt="Made with love by Builder.io" src="https://user-images.githubusercontent.com/844291/230786555-a58479e4-75f3-4222-a6eb-74c5af953eac.png">
|
|
|
+ </picture>
|
|
|
+ </a>
|
|
|
+</p>
|