
Merge branch 'main' into sitemap-support

guillermoscript, 1 year ago
parent
commit 7f771aa290
16 changed files with 3904 additions and 3784 deletions
  1. .gitignore: +3 -0
  2. Dockerfile: +1 -1
  3. README.md: +32 -32
  4. config.ts: +2 -32
  5. containerapp/Dockerfile: +35 -0
  6. containerapp/README.md: +14 -0
  7. containerapp/data/config.ts: +31 -0
  8. containerapp/data/init.sh: +11 -0
  9. containerapp/run.sh: +16 -0
  10. package-lock.json: +3477 -3601
  11. package.json: +13 -3
  12. src/cli.ts: +97 -0
  13. src/config.ts: +41 -0
  14. src/core.ts: +125 -0
  15. src/main.ts: +4 -114
  16. tsconfig.json: +2 -1

+ 3 - 0
.gitignore

@@ -6,3 +6,6 @@ node_modules
 apify_storage
 crawlee_storage
 storage
+
+# any output from the crawler
+*.json

+ 1 - 1
Dockerfile

@@ -48,4 +48,4 @@ COPY --chown=myuser . ./
 
 # Run the image. If you know you won't need headful browsers,
 # you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

File diff suppressed because it is too large
+ 32 - 32
README.md


File diff suppressed because it is too large
+ 2 - 32
config.ts


+ 35 - 0
containerapp/Dockerfile

@@ -0,0 +1,35 @@
+FROM ubuntu:jammy
+
+# Install Git
+RUN apt-get update && \
+    apt-get install sudo -y && \
+    apt-get install git -y
+
+# Install Docker
+RUN apt-get install ca-certificates curl gnupg -y && \
+    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
+    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && \
+    apt-get update && \
+    apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y
+
+# Install Node.js v20 and npm
+RUN sudo apt-get update && \
+    sudo apt-get install -y ca-certificates curl gnupg && \
+    sudo mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg 
+
+RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list && \
+    sudo apt-get update && \
+    sudo apt-get install nodejs -y
+
+# Install gpt-crawler
+RUN cd /home && git clone https://github.com/builderio/gpt-crawler && cd gpt-crawler && \
+    npm i && \
+    npx playwright install && \
+    npx playwright install-deps
+
+# Create the directory that is mounted into the container to retrieve the output.json data
+RUN cd /home && mkdir data
+
+
+WORKDIR /home

+ 14 - 0
containerapp/README.md

@@ -0,0 +1,14 @@
+# Containerized crawler
+
+## Docker image with the packaged crawler, plus a script for building and running it.
+
+All dependencies are set up and configured in the Dockerfile. Requires Docker to be installed.
+
+## Get started
+
+### Prerequisites
+
+Be sure you have Docker installed.
+
+1. `cd gpt-crawler/containerapp`
+2. `. ./run.sh`

+ 31 - 0
containerapp/data/config.ts

@@ -0,0 +1,31 @@
+import { Page } from "playwright";
+
+type Config = {
+  /** URL to start the crawl */
+  url: string;
+  /** Pattern to match against for links on a page to subsequently crawl */
+  match: string;
+  /** Selector to grab the inner text from */
+  selector: string;
+  /** Don't crawl more than this many pages */
+  maxPagesToCrawl: number;
+  /** File name for the finished data */
+  outputFileName: string;
+  /** Optional cookie to be set. E.g. for Cookie Consent */
+  cookie?: { name: string; value: string };
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
+};
+
+export const config: Config = {
+  url: "https://www.builder.io/c/docs/developers",
+  match: "https://www.builder.io/c/docs/**",
+  selector: `.docs-builder-container`,
+  maxPagesToCrawl: 50,
+  outputFileName: "../data/output.json",
+};

+ 11 - 0
containerapp/data/init.sh

@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# copy the config when starting the container
+cp /home/data/config.ts /home/gpt-crawler/
+
+# start the crawler
+cd /home/gpt-crawler && npm start
+
+# Print message after crawling and exit
+echo "Crawling complete.."
+exit

+ 16 - 0
containerapp/run.sh

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Check if there is a Docker image named "crawler"
+if ! sudo docker images | grep -w 'crawler' > /dev/null; then
+    echo "Docker repository 'crawler' not found. Building the image..."
+    # Build the Docker image with the name 'crawler'
+    sudo docker build -t crawler .
+else
+    echo "Docker image already built."
+fi
+
+# Ensure that init.sh script is executable
+sudo chmod +x ./data/init.sh
+
+# Start the container: mount docker.sock so docker-in-docker works, and mount the data directory for input/output to and from the container
+sudo docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -v ./data:/home/data crawler bash -c "/home/data/init.sh"

File diff suppressed because it is too large
+ 3477 - 3601
package-lock.json


+ 13 - 3
package.json

@@ -2,23 +2,33 @@
   "name": "@builder.io/gpt-crawler",
   "version": "0.0.1",
   "type": "module",
+  "bin": {
+    "gpt-crawler": "./dist/src/cli.js"
+  },
   "description": "Crawl a site to generate knowledge files to create your own custom GPT",
   "dependencies": {
+    "commander": "^11.1.0",
     "crawlee": "^3.0.0",
     "glob": "^10.3.10",
-    "playwright": "*"
+    "inquirer": "^9.2.12",
+    "playwright": "*",
+    "prettier": "^3.1.0"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
+    "@types/inquirer": "^9.0.7",
     "@types/node": "^20.0.0",
     "ts-node": "^10.8.0",
     "typescript": "^5.0.0"
   },
   "scripts": {
+    "preinstall": "npx playwright install",
     "start": "npm run start:dev",
+    "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
+    "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/main.js",
-    "start:dev": "node --no-warnings=ExperimentalWarning --loader ts-node/esm/transpile-only src/main.ts",
-    "build": "tsc"
+    "build": "tsc",
+    "fmt": "prettier --write ."
   },
   "author": "It's not you it's me",
   "license": "ISC"

+ 97 - 0
src/cli.ts

@@ -0,0 +1,97 @@
+#!/usr/bin/env node
+
+import { program } from "commander";
+import { Config } from "./config.js";
+import { crawl, write } from "./core.js";
+import { createRequire } from "node:module";
+import inquirer from "inquirer";
+
+const require = createRequire(import.meta.url);
+const { version, description } = require("../../package.json");
+
+const messages = {
+  url: "What is the first URL of the website you want to crawl?",
+  match: "What is the URL pattern you want to match?",
+  selector: "What is the CSS selector you want to match?",
+  maxPagesToCrawl: "How many pages do you want to crawl?",
+  outputFileName: "What is the name of the output file?",
+};
+
+async function handler(options: Config) {
+  try {
+    const {
+      url,
+      match,
+      selector,
+      maxPagesToCrawl: maxPagesToCrawlStr,
+      outputFileName,
+    } = options;
+
+    // @ts-ignore
+    const maxPagesToCrawl = parseInt(maxPagesToCrawlStr, 10);
+
+    let config: Config = {
+      url,
+      match,
+      selector,
+      maxPagesToCrawl,
+      outputFileName,
+    };
+
+    if (!config.url || !config.match || !config.selector) {
+      const questions = [];
+
+      if (!config.url) {
+        questions.push({
+          type: "input",
+          name: "url",
+          message: messages.url,
+        });
+      }
+
+      if (!config.match) {
+        questions.push({
+          type: "input",
+          name: "match",
+          message: messages.match,
+        });
+      }
+
+      if (!config.selector) {
+        questions.push({
+          type: "input",
+          name: "selector",
+          message: messages.selector,
+        });
+      }
+
+      const answers = await inquirer.prompt(questions);
+
+      config = {
+        ...config,
+        ...answers,
+      };
+    }
+
+    await crawl(config);
+    await write(config);
+  } catch (error) {
+    console.log(error);
+  }
+}
+
+program.version(version).description(description);
+
+program
+  .option("-u, --url <string>", messages.url, "")
+  .option("-m, --match <string>", messages.match, "")
+  .option("-s, --selector <string>", messages.selector, "")
+  .option("-m, --maxPagesToCrawl <number>", messages.maxPagesToCrawl, "50")
+  .option(
+    "-o, --outputFileName <string>",
+    messages.outputFileName,
+    "output.json",
+  )
+  .action(handler);
+
+program.parse();
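
The new CLI is a thin wrapper around crawl and write from src/core.ts: it reads url, match, and selector from flags, prompts for any that are missing, and falls back to maxPagesToCrawl 50 and output.json. A minimal sketch of the equivalent programmatic call, assuming the Config shape from src/config.ts below; the URL, match pattern, and selector values are just illustrative:

import { crawl, write } from "./core.js";
import type { Config } from "./config.js";

// Roughly what `gpt-crawler -u <url> -m <pattern> -s <selector>` ends up doing,
// using the CLI defaults maxPagesToCrawl = 50 and outputFileName = "output.json".
const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};

await crawl(config); // crawl pages into ./storage/datasets/default
await write(config); // merge the dataset JSON files into output.json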

+ 41 - 0
src/config.ts

@@ -0,0 +1,41 @@
+import type { Page } from "playwright";
+
+export type Config = {
+  /**
+   * URL to start the crawl
+   * @example "https://www.builder.io/c/docs/developers"
+   * @default ""
+   */
+  url: string;
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
+  match: string | string[];
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   */
+  selector?: string;
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
+  maxPagesToCrawl: number;
+  /**
+   * File name for the finished data
+   * @default "output.json"
+   */
+  outputFileName: string;
+  /** Optional cookie to be set. E.g. for Cookie Consent */
+  cookie?: { name: string; value: string };
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
+};
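
src/main.ts (further down in this diff) imports a defaultConfig from the project-root config.ts, whose diff is suppressed above. Here is a minimal sketch of what that root file might contain, assuming it only re-exports a Config instance under that name; the actual contents of the real config.ts are not shown in this commit view:

import { Config } from "./src/config.js";

// Hypothetical root config.ts: main.ts only requires that it export
// a `defaultConfig` of type Config.
export const defaultConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};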

+ 125 - 0
src/core.ts

@@ -0,0 +1,125 @@
+// For more information, see https://crawlee.dev/
+import { PlaywrightCrawler } from "crawlee";
+import { readFile, writeFile } from "fs/promises";
+import { glob } from "glob";
+import { Config } from "./config.js";
+import { Page } from "playwright";
+
+let pageCounter = 0;
+
+export function getPageHtml(page: Page, selector = "body") {
+  return page.evaluate((selector) => {
+    // Check if the selector is an XPath
+    if (selector.startsWith("/")) {
+      const elements = document.evaluate(
+        selector,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null
+      );
+      let result = elements.iterateNext();
+      return result ? result.textContent || "" : "";
+    } else {
+      // Handle as a CSS selector
+      const el = document.querySelector(selector) as HTMLElement | null;
+      return el?.innerText || "";
+    }
+  }, selector);
+}
+
+export async function waitForXPath(page: Page, xpath: string, timeout: number) {
+  await page.waitForFunction(
+    (xpath) => {
+      const elements = document.evaluate(
+        xpath,
+        document,
+        null,
+        XPathResult.ANY_TYPE,
+        null
+      );
+      return elements.iterateNext() !== null;
+    },
+    xpath,
+    { timeout }
+  );
+}
+
+export async function crawl(config: Config) {
+  if (process.env.NO_CRAWL !== "true") {
+    // PlaywrightCrawler crawls the web using a headless
+    // browser controlled by the Playwright library.
+    const crawler = new PlaywrightCrawler({
+      // Use the requestHandler to process each of the crawled pages.
+      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+        if (config.cookie) {
+          // Set the cookie for the specific URL
+          const cookie = {
+            name: config.cookie.name,
+            value: config.cookie.value,
+            url: request.loadedUrl,
+          };
+          await page.context().addCookies([cookie]);
+        }
+
+        const title = await page.title();
+        pageCounter++;
+        log.info(
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
+        );
+
+        // Use custom handling for XPath selector
+        if (config.selector) {
+          if (config.selector.startsWith("/")) {
+            await waitForXPath(
+              page,
+              config.selector,
+              config.waitForSelectorTimeout ?? 1000
+            );
+          } else {
+            await page.waitForSelector(config.selector, {
+              timeout: config.waitForSelectorTimeout ?? 1000,
+            });
+          }
+        }
+
+        const html = await getPageHtml(page, config.selector);
+
+        // Save results as JSON to ./storage/datasets/default
+        await pushData({ title, url: request.loadedUrl, html });
+
+        if (config.onVisitPage) {
+          await config.onVisitPage({ page, pushData });
+        }
+
+        // Extract links from the current page
+        // and add them to the crawling queue.
+        await enqueueLinks({
+          globs:
+            typeof config.match === "string" ? [config.match] : config.match,
+        });
+      },
+      // Comment this option out to scrape the full website.
+      maxRequestsPerCrawl: config.maxPagesToCrawl,
+      // Uncomment this option to see the browser window.
+      // headless: false,
+    });
+
+    // Add first URL to the queue and start the crawl.
+    await crawler.run([config.url]);
+  }
+}
+
+export async function write(config: Config) {
+  const jsonFiles = await glob("storage/datasets/default/*.json", {
+    absolute: true,
+  });
+
+  const results = [];
+  for (const file of jsonFiles) {
+    const data = JSON.parse(await readFile(file, "utf-8"));
+    results.push(data);
+  }
+
+  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+}
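
core.ts treats a selector that starts with "/" as an XPath expression and exposes the optional onVisitPage hook, which receives the Playwright page and pushData. A hedged sketch of a config exercising both, assuming (as the pushData calls in requestHandler suggest) that extra records pushed from the hook land in the same dataset that write() later merges; the selector and field names are illustrative only:

import { Config } from "./config.js";

// Sketch only: an XPath selector (detected by the leading "/") plus an
// onVisitPage hook that also records each page's meta description.
export const xpathExampleConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: "//div[contains(@class, 'docs-builder-container')]",
  waitForSelectorTimeout: 3000,
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  onVisitPage: async ({ page, pushData }) => {
    const description = await page
      .locator('meta[name="description"]')
      .getAttribute("content");
    await pushData({ url: page.url(), description });
  },
};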

+ 4 - 114
src/main.ts

@@ -1,115 +1,5 @@
-// For more information, see https://crawlee.dev/
-import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
-import { readFile, writeFile } from "fs/promises";
-import { glob } from "glob";
-import { config } from "../config.js";
-import { Page } from "playwright";
+import { defaultConfig } from "../config.js";
+import { crawl, write } from "./core.js";
 
-export function getPageHtml(page: Page) {
-  return page.evaluate((selector) => {
-    const el = document.querySelector(selector) as HTMLElement | null;
-    // If the selector is not found, fall back to the body
-    const defaultSelector = "body";
-    if (!el) {
-      console.warn(
-        `Selector "${selector}" not found, falling back to "${defaultSelector}"`
-      );
-    }
-    return el?.innerText ?? document.querySelector(defaultSelector)?.innerText;
-  }, config.selector);
-}
-
-if (process.env.NO_CRAWL !== "true") {
-  // PlaywrightCrawler crawls the web using a headless
-  // browser controlled by the Playwright library.
-  const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
-  const crawler = new PlaywrightCrawler({
-    // Use the requestHandler to process each of the crawled pages.
-    async requestHandler({ request, page, enqueueLinks, log, pushData }) {
-
-      if (config.cookie) {
-        // Set the cookie for the specific URL
-        const cookie = {
-          name: config.cookie.name,
-          value: config.cookie.value,
-          url: request.loadedUrl,
-        };
-        await page.context().addCookies([cookie]);
-      }
-
-      const title = await page.title();
-      log.info(`Crawling ${request.loadedUrl}...`);
-
-      // Wait for the selector to appear on the page
-      async function waitForSelectorOrFallback(page: Page, selector: string, fallbackSelector: string, timeout: number) {
-        try {
-          await page.waitForSelector(selector, { timeout });
-        } catch (e) {
-          // If the selector is not found, fall back to the fallbackSelector
-          log.warning(`Selector "${selector}" not found, Falling back to "${fallbackSelector}"`);
-          await page.waitForSelector(fallbackSelector, { timeout });
-        }
-      }
-
-      await waitForSelectorOrFallback(page, config.selector, "body", config.waitForSelectorTimeout ?? 1000);
-
-      const html = await getPageHtml(page);
-
-      // Save results as JSON to ./storage/datasets/default
-      await pushData({ title, url: request.loadedUrl, html });
-
-      if (config.onVisitPage) {
-        await config.onVisitPage({ page, pushData });
-      }
-
-      // Extract links from the current page
-      // and add them to the crawling queue.
-      await enqueueLinks({
-        globs: [config.match],
-      });
-    },
-    // Comment this option to scrape the full website.
-    maxRequestsPerCrawl: config.maxPagesToCrawl,
-    // Uncomment this option to see the browser window.
-    // headless: false,
-    preNavigationHooks: [
-      // Abort requests for certain resource types
-      async ({ page, log }) => {
-        // If there are no resource exclusions, return
-        if (RESOURCE_EXCLUSTIONS.length === 0) {
-          return;
-        }
-        await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort());
-        log.info(`Aborting requests for as this is a resource excluded route`);
-      }
-    ],
-  });
-
-  const SITEMAP_SUFFIX = "sitemap.xml";
-  const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
-
-  if (isUrlASitemap) {
-    const listOfUrls = await downloadListOfUrls({ url: config.url });
-
-    // Add the initial URL to the crawling queue.
-    await crawler.addRequests(listOfUrls);
-
-    // Run the crawler
-    await crawler.run();
-  } else {
-    // Add first URL to the queue and start the crawl.
-    await crawler.run([config.url]);
-  }
-}
-
-const jsonFiles = await glob("storage/datasets/default/*.json", {
-  absolute: true,
-});
-
-const results = [];
-for (const file of jsonFiles) {
-  const data = JSON.parse(await readFile(file, "utf-8"));
-  results.push(data);
-}
-
-await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
+await crawl(defaultConfig);
+await write(defaultConfig);

+ 2 - 1
tsconfig.json

@@ -6,7 +6,8 @@
     "outDir": "dist",
     "resolveJsonModule": true,
     "noUnusedLocals": false,
+    "skipLibCheck": true,
     "lib": ["DOM"]
   },
-  "include": ["./src/**/*", "./config.ts"]
+  "include": ["./src/**/*", "config.ts"]
 }