marcelovicentegc 1 year ago
parent
commit
54fc5ffaa2
11 changed files with 268 additions and 173 deletions
  1. .gitignore (+3 -0)
  2. Dockerfile (+7 -7)
  3. README.md (+62 -57)
  4. bun.lockb (BIN)
  5. config.ts (+2 -9)
  6. package.json (+11 -3)
  7. src/cli.ts (+66 -0)
  8. src/core.ts (+100 -0)
  9. src/hardcoded.ts (+13 -0)
  10. src/main.ts (+3 -97)
  11. tsconfig.json (+1 -0)

+ 3 - 0
.gitignore

@@ -6,3 +6,6 @@ node_modules
 apify_storage
 crawlee_storage
 storage
+
+# any output from the crawler
+.json

+ 7 - 7
Dockerfile

@@ -8,7 +8,7 @@ FROM apify/actor-node-playwright-chrome:18 AS builder
 COPY --chown=myuser package*.json ./
 
 # Install all dependencies. Don't audit to speed up the installation.
-RUN npm install --include=dev --audit=false
+RUN bun install --include=dev --audit=false
 
 # Next, copy the source files using the user set
 # in the base image.
@@ -16,7 +16,7 @@ COPY --chown=myuser . ./
 
 # Install all dependencies and build the project.
 # Don't audit to speed up the installation.
-RUN npm run build
+RUN bun run build
 
 # Create final image
 FROM apify/actor-node-playwright-chrome:18
@@ -31,14 +31,14 @@ COPY --chown=myuser package*.json ./
 # Install NPM packages, skip optional and development dependencies to
 # keep the image small. Avoid logging too much and print the dependency
 # tree for debugging
-RUN npm --quiet set progress=false \
-    && npm install --omit=dev --omit=optional \
+RUN bun --quiet set progress=false \
+    && bun install --omit=dev --omit=optional \
     && echo "Installed NPM packages:" \
-    && (npm list --omit=dev --all || true) \
+    && (bun list --omit=dev --all || true) \
     && echo "Node.js version:" \
     && node --version \
     && echo "NPM version:" \
-    && npm --version
+    && bun --version
 
 # Next, copy the remaining files and directories with the source code.
 # Since we do this after NPM install, quick build will be really fast
@@ -48,4 +48,4 @@ COPY --chown=myuser . ./
 
 # Run the image. If you know you won't need headful browsers,
 # you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
+CMD ./start_xvfb_and_run_cmd.sh && bun run start:prod --silent
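
The lines above swap `npm` for `bun` name-for-name, but bun's CLI is not flag-compatible with npm: depending on the bun version, `--audit`, `--omit`, `set progress`, and `bun list` may not be recognized. A minimal sketch of the same install-and-debug step using flags that bun documents (an assumption about the intended behaviour, not what this commit ships):

```sh
# install only production dependencies, strictly from the committed lockfile
bun install --production --frozen-lockfile
# print the dependency tree for debugging (bun's counterpart to `npm list`)
bun pm ls
# print tool versions
node --version
bun --version
```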

+ 62 - 57
README.md

@@ -4,22 +4,72 @@ Crawl a site to generate knowledge files to create your own custom GPT from one
 
 ![Gif showing the crawl run](https://github.com/BuilderIO/gpt-crawler/assets/844291/feb8763a-152b-4708-9c92-013b5c70d2f2)
 
-
 ## Example
 
-[Here is a custom GPT](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) that I quickly made to help answer questions about how to use and integrate [Builder.io](https://www.builder.io) by simply providing the URL to the Builder docs. 
+[Here is a custom GPT](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) that I quickly made to help answer questions about how to use and integrate [Builder.io](https://www.builder.io) by simply providing the URL to the Builder docs.
 
 This project crawled the docs and generated the file that I uploaded as the basis for the custom GPT.
 
-[Try it out yourself](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) by asking questions about how to integrate Builder.io into a site. 
+[Try it out yourself](https://chat.openai.com/g/g-kywiqipmR-builder-io-assistant) by asking questions about how to integrate Builder.io into a site.
 
 > Note that you may need a paid ChatGPT plan to access this feature
 
 ## Get started
 
+### Install
+
+```sh
+npm i -g @builder.io/gpt-crawler
+```
+
+### Run
+
+```sh
+gpt-crawler --url https://www.builder.io/c/docs/developers --match https://www.builder.io/c/docs/** --selector .docs-builder-container --maxPagesToCrawl 50 --outputFileName output.json
+```
+
+### Upload your data to OpenAI
+
+The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
+
+#### Create a custom GPT
+
+Use this option for UI access to your generated knowledge that you can easily share with others
+
+> Note: you may need a paid ChatGPT plan to create and use custom GPTs right now
+
+1. Go to [https://chat.openai.com/](https://chat.openai.com/)
+2. Click your name in the bottom left corner
+3. Choose "My GPTs" in the menu
+4. Choose "Create a GPT"
+5. Choose "Configure"
+6. Under "Knowledge" choose "Upload a file" and upload the file you generated
+
+![Gif of how to upload a custom GPT](https://github.com/BuilderIO/gpt-crawler/assets/844291/22f27fb5-6ca5-4748-9edd-6bcf00b408cf)
+
+#### Create a custom assistant
+
+Use this option for API access to your generated knowledge that you can integrate into your product.
+
+1. Go to [https://platform.openai.com/assistants](https://platform.openai.com/assistants)
+2. Click "+ Create"
+3. Choose "upload" and upload the file you generated
+
+![Gif of how to upload to an assistant](https://github.com/BuilderIO/gpt-crawler/assets/844291/06e6ad36-e2ba-4c6e-8d5a-bf329140de49)
+
+## (Alternate method) Running in a container with Docker
+
+To generate `output.json` from a containerized run, go into the `containerapp` directory and edit its `config.ts` the same way as above; the `output.json` file will be generated in the `data` folder. Note: the `outputFileName` property in the `containerapp` folder's `config.ts` is already configured to work with the container.
+
+## Contributing
+
+Know how to make this project better? Send a PR!
+
+## Get started developing
+
 ### Prerequisites
 
-Be sure you have Node.js >= 16 installed
+Be sure you have Node.js >= 16 installed along with [bun](https://bun.sh/)
 
 ### Clone the repo
 
@@ -30,17 +80,12 @@ git clone https://github.com/builderio/gpt-crawler
 ### Install Dependencies
 
 ```sh
-npm i
-```
-
-If you do not have Playwright installed:
-```sh
-npx playwright install
+bun i
 ```
 
-### Configure the crawler
+### Running GPT Crawler with a hardcoded configuration file
 
-Open [config.ts](config.ts) and edit the `url` and `selectors` properties to match your needs.
+Open [hardcoded.ts](./src/hardcoded.ts) and edit the `url`, `match` and `selectors` properties to match your needs.
 
 E.g. to crawl the Builder.io docs to make our custom GPT you can use:
 
@@ -54,7 +99,7 @@ export const config: Config = {
 };
 ```
 
-See the top of the file for the type definition for what you can configure:
+See the top of the [config.ts](./config.ts) file for the type definition for what you can configure:
 
 ```ts
 type Config = {
@@ -68,62 +113,22 @@ type Config = {
   maxPagesToCrawl: number;
   /** File name for the finished data */
   outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: {name: string; value: string}
   /** Optional function to run for each page found */
   onVisitPage?: (options: {
     page: Page;
     pushData: (data: any) => Promise<void>;
   }) => Promise<void>;
-    /** Optional timeout for waiting for a selector to appear */
-    waitForSelectorTimeout?: number;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
 };
 ```
 
-### Run your crawler
+#### Run your crawler
 
 ```sh
-npm start
+bun start
 ```
 
-### Upload your data to OpenAI
-
-The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
-
-#### Create a custom GPT
-
-Use this option for UI access to your generated knowledge that you can easily share with others
-
-> Note: you may need a paid ChatGPT plan to create and use custom GPTs right now
-
-1. Go to [https://chat.openai.com/](https://chat.openai.com/)
-2. Click your name in the bottom left corner
-3. Choose "My GPTs" in the menu
-4. Choose "Create a GPT"
-5. Choose "Configure"
-6. Under "Knowledge" choose "Upload a file" and upload the file you generated
-
-![Gif of how to upload a custom GPT](https://github.com/BuilderIO/gpt-crawler/assets/844291/22f27fb5-6ca5-4748-9edd-6bcf00b408cf)
-
-
-#### Create a custom assistant
-
-Use this option for API access to your generated knowledge that you can integrate into your product.
-
-1. Go to [https://platform.openai.com/assistants](https://platform.openai.com/assistants)
-2. Click "+ Create"
-3. Choose "upload" and upload the file you generated
-
-![Gif of how to upload to an assistant](https://github.com/BuilderIO/gpt-crawler/assets/844291/06e6ad36-e2ba-4c6e-8d5a-bf329140de49)
-
-## (Alternate method) Running in a container with Docker
-To obtain the `output.json` with a containerized execution. Go into the `containerapp` directory. Modify the `config.ts` same as above, the `output.json`file should be generated in the data folder. Note : the `outputFileName` property in the `config.ts` file in containerapp folder is configured to work with the container. 
-
-
-## Contributing
-
-Know how to make this project better? Send a PR!
-
 <br>
 <br>
 

BIN
bun.lockb


+ 2 - 9
config.ts

@@ -1,5 +1,6 @@
 import { Page } from "playwright";
-type Config = {
+
+export type Config = {
   /** URL to start the crawl */
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
@@ -20,11 +21,3 @@ type Config = {
   /** Optional timeout for waiting for a selector to appear */
   waitForSelectorTimeout?: number;
 };
-
-export const config: Config = {
-  url: "https://www.builder.io/c/docs/developers",
-  match: "https://www.builder.io/c/docs/**",
-  selector: `.docs-builder-container`,
-  maxPagesToCrawl: 50,
-  outputFileName: "output.json",
-};
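
With `Config` now exported from `config.ts`, callers can assemble their own configuration objects. A sketch of a programmatic config that exercises the optional `onVisitPage` and `waitForSelectorTimeout` fields (the file name and the extra dataset record are illustrative, not part of the commit):

```ts
// my-config.ts (hypothetical consumer module)
import { Config } from "./config.js";

export const myConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  // wait up to 3 seconds for the selector before giving up on a page
  waitForSelectorTimeout: 3000,
  // called for every crawled page; pushData appends an extra record to the dataset
  onVisitPage: async ({ page, pushData }) => {
    await pushData({ url: page.url(), visitedAt: new Date().toISOString() });
  },
};
```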

+ 11 - 3
package.json

@@ -2,22 +2,30 @@
   "name": "@builder.io/gpt-crawler",
   "version": "0.0.1",
   "type": "module",
+  "bin": {
+    "gpt-crawler": "./dist/src/cli.js"
+  },
   "description": "Crawl a site to generate knowledge files to create your own custom GPT",
   "dependencies": {
+    "commander": "^11.1.0",
     "crawlee": "^3.0.0",
     "glob": "^10.3.10",
+    "inquirer": "^9.2.12",
     "playwright": "*"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
+    "@types/inquirer": "^9.0.7",
     "@types/node": "^20.0.0",
     "ts-node": "^10.8.0",
     "typescript": "^5.0.0"
   },
   "scripts": {
-    "start": "npm run start:dev",
-    "start:prod": "node dist/main.js",
-    "start:dev": "node --no-warnings=ExperimentalWarning --loader ts-node/esm/transpile-only src/main.ts",
+    "preinstall": "bunx playwright install",
+    "start": "bun run start:dev",
+    "start:cli": "NODE_ENV=development bun run build && node dist/src/cli.js",
+    "start:prod": "node dist/src/main.js",
+    "start:dev": "bun run build && node --no-warnings=ExperimentalWarning dist/src/main.js",
     "build": "tsc"
   },
   "author": "It's not you it's me",

+ 66 - 0
src/cli.ts

@@ -0,0 +1,66 @@
+#!/usr/bin/env node
+
+import { program } from "commander";
+import { Config } from "../config.js";
+import { crawl, write } from "./core.js";
+import { createRequire } from "node:module";
+import inquirer from "inquirer";
+
+const require = createRequire(import.meta.url);
+const { version, description } = require("../../package.json");
+
+async function handler(options: any) {
+  try {
+    let config: Config = {
+      url: options.url,
+      match: options.match,
+      selector: options.selector,
+      maxPagesToCrawl: 50,
+      outputFileName: options.outputFileName ?? "output.json",
+    };
+
+    if (!config.url || !config.match || !config.selector) {
+      const { url, match, selector } = await inquirer
+        .prompt([
+          {
+            type: "input",
+            name: "url",
+            message: "What is the URL of the website you want to crawl?",
+          },
+          {
+            type: "input",
+            name: "match",
+            message: "What is the URL pattern you want to match?",
+          },
+          {
+            type: "input",
+            name: "selector",
+            message: "What is the CSS selector you want to match?",
+          },
+        ]);
+
+      config.url = url;
+      config.match = match;
+      config.selector = selector;
+    }
+
+    await crawl(config);
+    await write(config);
+  } catch (error) {
+    console.log(error);
+  }
+}
+
+program
+  .version(version)
+  .description(description);
+
+program
+  .option("-u, --url")
+  .option("-m, --match")
+  .option("-s, --selector")
+  .option("-m, --maxPagesToCrawl")
+  .option("-o, --outputFileName")
+  .action(handler);
+
+program.parse();
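
One caveat on the option declarations above: Commander treats an option declared without a value placeholder, such as `.option("-u, --url")`, as a boolean flag, `-m` is registered twice (for `--match` and for `--maxPagesToCrawl`), and `handler` hard-codes `maxPagesToCrawl` to 50. A sketch of value-taking declarations that would line up with the README's `gpt-crawler --url ... --match ... --selector ...` invocation (not what this commit ships; the `-p` short flag is our choice):

```ts
// Sketch only: a <value> placeholder makes Commander capture the argument
// instead of treating the option as a boolean flag.
program
  .option("-u, --url <url>", "URL to start the crawl from")
  .option("-m, --match <pattern>", "glob pattern links must match to be crawled")
  .option("-s, --selector <selector>", "CSS selector (or XPath starting with '/') to extract")
  .option("-p, --maxPagesToCrawl <number>", "maximum number of pages to crawl", "50")
  .option("-o, --outputFileName <file>", "file to write the combined crawl output to")
  .action(handler);
```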

+ 100 - 0
src/core.ts

@@ -0,0 +1,100 @@
+// For more information, see https://crawlee.dev/
+import { PlaywrightCrawler } from "crawlee";
+import { readFile, writeFile } from "fs/promises";
+import { glob } from "glob";
+import { Config } from "../config";
+import { Page } from "playwright";
+
+let pageCounter = 0; 
+
+export function getPageHtml(page: Page, selector: string) {
+  return page.evaluate((selector) => {
+    // Check if the selector is an XPath
+    if (selector.startsWith('/')) {
+      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
+      let result = elements.iterateNext();
+      return result ? result.textContent || "" : "";
+    } else {
+      // Handle as a CSS selector
+      const el = document.querySelector(selector) as HTMLElement | null;
+      return el?.innerText || "";
+    }
+  }, selector);
+}
+
+export async function waitForXPath(page: Page, xpath: string, timeout: number) {
+  await page.waitForFunction(xpath => {
+    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
+    return elements.iterateNext() !== null;
+  }, xpath, { timeout });
+}
+
+export async function crawl(config: Config) {
+  if (process.env.NO_CRAWL !== "true") {
+    // PlaywrightCrawler crawls the web using a headless
+    // browser controlled by the Playwright library.
+    const crawler = new PlaywrightCrawler({
+      // Use the requestHandler to process each of the crawled pages.
+      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+        if (config.cookie) {
+          // Set the cookie for the specific URL
+          const cookie = {
+            name: config.cookie.name,
+            value: config.cookie.value,
+            url: request.loadedUrl, 
+          };
+          await page.context().addCookies([cookie]);
+        }
+  
+        const title = await page.title();
+        pageCounter++;
+        log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
+        
+        // Use custom handling for XPath selector
+        if (config.selector.startsWith('/')) {
+          await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
+        } else {
+          await page.waitForSelector(config.selector, {
+            timeout: config.waitForSelectorTimeout ?? 1000,
+          });
+        }
+  
+        const html = await getPageHtml(page, config.selector);
+  
+        // Save results as JSON to ./storage/datasets/default
+        await pushData({ title, url: request.loadedUrl, html });
+  
+        if (config.onVisitPage) {
+          await config.onVisitPage({ page, pushData });
+        }
+  
+        // Extract links from the current page
+        // and add them to the crawling queue.
+        await enqueueLinks({
+          globs: typeof config.match === "string" ? [config.match] : config.match,
+        });
+      },
+      // Comment this option to scrape the full website.
+      maxRequestsPerCrawl: config.maxPagesToCrawl,
+      // Uncomment this option to see the browser window.
+      // headless: false,
+    });
+  
+    // Add first URL to the queue and start the crawl.
+    await crawler.run([config.url]);
+  }  
+}
+
+export async function write(config: Config) {
+  const jsonFiles = await glob("storage/datasets/default/*.json", {
+    absolute: true,
+  });
+  
+  const results = [];
+  for (const file of jsonFiles) {
+    const data = JSON.parse(await readFile(file, "utf-8"));
+    results.push(data);
+  }
+  
+  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+}
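
`getPageHtml` and the request handler treat any selector that starts with `/` as an XPath expression and everything else as a CSS selector, so XPath extraction needs no extra configuration. A hedged example config that targets a page's main content by XPath (the URL and expression are illustrative only):

```ts
import { Config } from "../config.js";
import { crawl, write } from "./core.js";

// Illustrative only: a selector beginning with "/" takes the XPath branch
// (waitForXPath + document.evaluate) instead of page.waitForSelector.
const xpathConfig: Config = {
  url: "https://example.com/docs",
  match: "https://example.com/docs/**",
  selector: "//main//article", // XPath for the <article> inside <main>
  maxPagesToCrawl: 10,
  outputFileName: "output.json",
};

await crawl(xpathConfig);
await write(xpathConfig);
```

Note that `crawl` is a no-op when `NO_CRAWL=true`, so `write` can be re-run on its own to re-aggregate whatever already sits in `storage/datasets/default`.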

+ 13 - 0
src/hardcoded.ts

@@ -0,0 +1,13 @@
+import { Config } from "../config.js";
+import { crawl, write } from "./core.js";
+
+const hardcodedConfig: Config = {
+  url: "https://www.builder.io/c/docs/developers",
+  match: "https://www.builder.io/c/docs/**",
+  selector: `.docs-builder-container`,
+  maxPagesToCrawl: 50,
+  outputFileName: "output.json",
+};
+
+await crawl(hardcodedConfig);
+await write(hardcodedConfig);

+ 3 - 97
src/main.ts

@@ -1,97 +1,3 @@
-// For more information, see https://crawlee.dev/
-import { PlaywrightCrawler } from "crawlee";
-import { readFile, writeFile } from "fs/promises";
-import { glob } from "glob";
-import { config } from "../config.js";
-import { Page } from "playwright";
-
-let pageCounter = 0; 
-
-export function getPageHtml(page: Page, selector: string) {
-  return page.evaluate((selector) => {
-    // Check if the selector is an XPath
-    if (selector.startsWith('/')) {
-      const elements = document.evaluate(selector, document, null, XPathResult.ANY_TYPE, null);
-      let result = elements.iterateNext();
-      return result ? result.textContent || "" : "";
-    } else {
-      // Handle as a CSS selector
-      const el = document.querySelector(selector) as HTMLElement | null;
-      return el?.innerText || "";
-    }
-  }, selector);
-}
-
-export async function waitForXPath(page: Page, xpath: string, timeout: number) {
-  await page.waitForFunction(xpath => {
-    const elements = document.evaluate(xpath, document, null, XPathResult.ANY_TYPE, null);
-    return elements.iterateNext() !== null;
-  }, xpath, { timeout });
-}
-
-if (process.env.NO_CRAWL !== "true") {
-  // PlaywrightCrawler crawls the web using a headless
-  // browser controlled by the Playwright library.
-  const crawler = new PlaywrightCrawler({
-    // Use the requestHandler to process each of the crawled pages.
-    async requestHandler({ request, page, enqueueLinks, log, pushData }) {
-
-      if (config.cookie) {
-        // Set the cookie for the specific URL
-        const cookie = {
-          name: config.cookie.name,
-          value: config.cookie.value,
-          url: request.loadedUrl, 
-        };
-        await page.context().addCookies([cookie]);
-      }
-
-      const title = await page.title();
-      pageCounter++;
-      log.info(`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`);
-      
-      // Use custom handling for XPath selector
-      if (config.selector.startsWith('/')) {
-        await waitForXPath(page, config.selector, config.waitForSelectorTimeout ?? 1000);
-      } else {
-        await page.waitForSelector(config.selector, {
-          timeout: config.waitForSelectorTimeout ?? 1000,
-        });
-      }
-
-      const html = await getPageHtml(page, config.selector);
-
-      // Save results as JSON to ./storage/datasets/default
-      await pushData({ title, url: request.loadedUrl, html });
-
-      if (config.onVisitPage) {
-        await config.onVisitPage({ page, pushData });
-      }
-
-      // Extract links from the current page
-      // and add them to the crawling queue.
-      await enqueueLinks({
-        globs: typeof config.match === "string" ? [config.match] : config.match,
-      });
-    },
-    // Comment this option to scrape the full website.
-    maxRequestsPerCrawl: config.maxPagesToCrawl,
-    // Uncomment this option to see the browser window.
-    // headless: false,
-  });
-
-  // Add first URL to the queue and start the crawl.
-  await crawler.run([config.url]);
-}
-
-const jsonFiles = await glob("storage/datasets/default/*.json", {
-  absolute: true,
-});
-
-const results = [];
-for (const file of jsonFiles) {
-  const data = JSON.parse(await readFile(file, "utf-8"));
-  results.push(data);
-}
-
-await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+export * from "./core.js";
+export * from "./cli.js";
+export * from "./hardcoded.js";

+ 1 - 0
tsconfig.json

@@ -6,6 +6,7 @@
     "outDir": "dist",
     "resolveJsonModule": true,
     "noUnusedLocals": false,
+    "skipLibCheck": true,
     "lib": ["DOM"]
   },
   "include": ["./src/**/*", "./config.ts"]