Explorar o código

Merge pull request #36 from Umar-Azam/main

Added option for simple containerized execution
Steve Sewell hai 1 ano
pai
achega
d713d1c56c
Modificáronse 6 ficheiros con 110 adicións e 0 borrados
  1. 2 0
      README.md
  2. 35 0
      containerapp/Dockerfile
  3. 15 0
      containerapp/README.md
  4. 31 0
      containerapp/data/config.ts
  5. 11 0
      containerapp/data/init.sh
  6. 16 0
      containerapp/run.sh

+ 2 - 0
README.md

@@ -116,6 +116,8 @@ Use this option for API access to your generated knowledge that you can integrat
 
 ![Gif of how to upload to an assistant](https://github.com/BuilderIO/gpt-crawler/assets/844291/06e6ad36-e2ba-4c6e-8d5a-bf329140de49)
 
+## (Alternate method) Running in a container with Docker
+To obtain the `output.json` with a containerized execution, go into the `containerapp` directory and modify the `config.ts` the same as above; the `output.json` file should be generated in the data folder. Note: the `outputFileName` property in the `config.ts` file in the containerapp folder is configured to work with the container.
 
 
 ## Contributing

+ 35 - 0
containerapp/Dockerfile

@@ -0,0 +1,35 @@
+FROM ubuntu:jammy
+
+# Install Git (needed below to clone the gpt-crawler repository)
+RUN apt-get update && \
+    apt-get install sudo -y && \
+    apt-get install git -y
+
+# Install Docker Engine and CLI plugins from Docker's apt repository
+# (the container talks to the host daemon via the socket mounted in run.sh)
+RUN apt-get install ca-certificates curl gnupg -y && \
+    curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg && \
+    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null && \
+    apt-get update && \
+    apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y
+
+# Install Node.js v20 (includes npm) from the NodeSource apt repository
+RUN sudo apt-get update && \
+    sudo apt-get install -y ca-certificates curl gnupg && \
+    sudo mkdir -p /etc/apt/keyrings && \
+    curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg 
+
+RUN echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list && \
+    sudo apt-get update && \
+    sudo apt-get install nodejs -y
+
+# Clone gpt-crawler and install its npm dependencies plus Playwright browsers
+RUN cd /home && git clone https://github.com/builderio/gpt-crawler && cd gpt-crawler && \
+    npm i && \
+    npx playwright install && \
+    npx playwright install-deps
+
+# Directory to mount in the docker container to get the output.json data
+RUN cd /home && mkdir data
+
+
+WORKDIR /home

+ 15 - 0
containerapp/README.md

@@ -0,0 +1,15 @@
+# Containerized crawler
+## Docker image with the packaged crawler and a script for building and execution.
+
+
+All dependencies set up and configured in the Dockerfile. Requires docker to be installed.
+
+
+## Get started
+
+### Prerequisites
+
+Be sure you have Docker installed.
+
+1. ``` cd gpt-crawler/containerapp  ```
+2. ``` . ./run.sh  ```

+ 31 - 0
containerapp/data/config.ts

@@ -0,0 +1,31 @@
+import { Page } from "playwright";
+
+type Config = {
+  /** URL to start the crawl */
+  url: string;
+  /** Pattern to match against for links on a page to subsequently crawl */
+  match: string;
+  /** Selector to grab the inner text from */
+  selector: string;
+  /** Don't crawl more than this many pages */
+  maxPagesToCrawl: number;
+  /** File name for the finished data */
+  outputFileName: string;
+  /** Optional cookie to be set. E.g. for Cookie Consent */
+  cookie?: {name: string; value: string}
+  /** Optional function to run for each page found */
+  onVisitPage?: (options: {
+    page: Page;
+    pushData: (data: any) => Promise<void>;
+  }) => Promise<void>;
+  /** Optional timeout for waiting for a selector to appear */
+  waitForSelectorTimeout?: number;
+};
+
+export const config: Config = {
+  url: "https://www.builder.io/c/docs/developers",
+  match: "https://www.builder.io/c/docs/**",
+  selector: `.docs-builder-container`,
+  maxPagesToCrawl: 50,
+  outputFileName: "../data/output.json",
+};

+ 11 - 0
containerapp/data/init.sh

@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Copy the (possibly user-edited) config from the mounted data directory into the crawler checkout
+cp /home/data/config.ts /home/gpt-crawler/
+
+# Run the crawler; output lands in /home/data per outputFileName in config.ts
+cd /home/gpt-crawler && npm start
+
+# Print message after crawling and exit the container shell
+echo "Crawling complete.."
+exit

+ 16 - 0
containerapp/run.sh

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Build the 'crawler' image only if it is not already present locally
+if ! sudo docker images | grep -w 'crawler' > /dev/null; then
+    echo "Docker repository 'crawler' not found. Building the image..."
+    # Build the Docker image with the name 'crawler'
+    sudo docker build -t crawler .
+else
+    echo "Docker image already built."
+fi
+
+# Ensure that init.sh script is executable (it runs inside the container via the bind mount)
+sudo chmod +x ./data/init.sh
+
+# Starting docker, mount docker.sock to work with docker-in-docker function, mount data directory for input/output from container
+sudo docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock -v ./data:/home/data crawler bash -c "/home/data/init.sh"