Steve Sewell 1 year ago
commit
a855e667de
10 changed files with 5188 additions and 0 deletions
  .dockerignore        +10  -0
  .gitignore            +8  -0
  Dockerfile           +51  -0
  README.md             +9  -0
  config.ts            +15  -0
  output.json        +1397  -0
  package-lock.json  +3605  -0
  package.json         +26  -0
  src/main.ts          +53  -0
  tsconfig.json        +14  -0

+ 10 - 0
.dockerignore

@@ -0,0 +1,10 @@
+# configurations
+.idea
+
+# crawlee and apify storage folders
+apify_storage
+crawlee_storage
+storage
+
+# installed files
+node_modules

+ 8 - 0
.gitignore

@@ -0,0 +1,8 @@
+# This file tells Git which files shouldn't be added to source control
+
+.idea
+dist
+node_modules
+apify_storage
+crawlee_storage
+storage

+ 51 - 0
Dockerfile

@@ -0,0 +1,51 @@
+# Specify the base Docker image. You can read more about
+# the available images at https://crawlee.dev/docs/guides/docker-images
+# You can also use any other image from Docker Hub.
+FROM apify/actor-node-playwright-chrome:18 AS builder
+
+# Copy just package.json and package-lock.json
+# to speed up the build using Docker layer cache.
+COPY --chown=myuser package*.json ./
+
+# Install all dependencies. Don't audit to speed up the installation.
+RUN npm install --include=dev --audit=false
+
+# Next, copy the source files using the user set
+# in the base image.
+COPY --chown=myuser . ./
+
+# Build the project. Dependencies were already installed in the
+# previous step, so this only runs the TypeScript compiler.
+RUN npm run build
+
+# Create final image
+FROM apify/actor-node-playwright-chrome:18
+
+# Copy only built JS files from builder image
+COPY --from=builder --chown=myuser /home/myuser/dist ./dist
+
+# Copy just package.json and package-lock.json
+# to speed up the build using Docker layer cache.
+COPY --chown=myuser package*.json ./
+
+# Install NPM packages, skipping optional and development dependencies to
+# keep the image small. Avoid logging too much, and print the dependency
+# tree for debugging.
+RUN npm --quiet set progress=false \
+    && npm install --omit=dev --omit=optional \
+    && echo "Installed NPM packages:" \
+    && (npm list --omit=dev --all || true) \
+    && echo "Node.js version:" \
+    && node --version \
+    && echo "NPM version:" \
+    && npm --version
+
+# Next, copy the remaining files and directories with the source code.
+# Because this runs after the NPM install, rebuilds will be fast
+# for most source file changes.
+COPY --chown=myuser . ./
+
+
+# Run the image. If you know you won't need headful browsers,
+# you can remove the XVFB start script for a micro perf gain.
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

+ 9 - 0
README.md

@@ -0,0 +1,9 @@
+# Getting started with Crawlee
+
+This example uses `PlaywrightCrawler` to recursively crawl https://crawlee.dev using the browser automation library [Playwright](https://playwright.dev).
+
+You can find more examples and documentation at the following links:
+
+- [Step-by-step tutorial](https://crawlee.dev/docs/introduction) for Crawlee
+- `PlaywrightCrawler` [API documentation](https://crawlee.dev/api/playwright-crawler/class/PlaywrightCrawler)
+- Other [examples](https://crawlee.dev/docs/examples/playwright-crawler)

+ 15 - 0
config.ts

@@ -0,0 +1,15 @@
+type Config = {
+  url: string;
+  glob: string;
+  selector: string;
+  maxPagesToCrawl: number;
+  outputFileName: string;
+};
+
+export const config = {
+  url: "https://www.builder.io/c/docs/developer",
+  glob: "https://www.builder.io/c/docs/*",
+  selector: ".docs-builder-container",
+  maxPagesToCrawl: 500,
+  outputFileName: "output.json",
+} satisfies Config;
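
The `satisfies Config` clause checks the exported literal against `Config` at compile time while preserving its exact property types. As a hedged sketch of how the file might be adapted for another site (the URL, glob, and selector below are invented for illustration, not part of this commit):

// Hypothetical alternative config; every value here is an assumption.
export const config = {
  url: "https://example.com/docs",       // first page to crawl
  glob: "https://example.com/docs/**",   // which discovered links to follow
  selector: "main",                      // element whose innerText is captured
  maxPagesToCrawl: 50,
  outputFileName: "example-docs.json",
  // outputFilename: "typo.json",        // rejected by `satisfies Config`
} satisfies Config;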

Changes to this file are not shown because the diff is too large
+ 1397 - 0
output.json


Changes to this file are not shown because the diff is too large
+ 3605 - 0
package-lock.json


+ 26 - 0
package.json

@@ -0,0 +1,26 @@
+{
+    "name": "gpt-crawler",
+    "version": "0.0.1",
+    "type": "module",
+    "description": "This is an example of a Crawlee project.",
+    "dependencies": {
+        "crawlee": "^3.0.0",
+        "glob": "^10.3.10",
+        "playwright": "*"
+    },
+    "devDependencies": {
+        "@apify/tsconfig": "^0.1.0",
+        "@types/node": "^20.0.0",
+        "ts-node": "^10.8.0",
+        "typescript": "^5.0.0"
+    },
+    "scripts": {
+        "start": "npm run start:dev",
+        "start:prod": "node dist/main.js",
+        "start:dev": "node --no-warnings=ExperimentalWarning --loader ts-node/esm/transpile-only src/main.ts",
+        "build": "tsc",
+        "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1"
+    },
+    "author": "It's not you it's me",
+    "license": "ISC"
+}

+ 53 - 0
src/main.ts

@@ -0,0 +1,53 @@
+// For more information, see https://crawlee.dev/
+import { PlaywrightCrawler } from "crawlee";
+import { readFile, writeFile } from "fs/promises";
+import { glob } from "glob";
+import { config } from "../config";
+
+if (process.env.NO_CRAWL !== "true") {
+  // PlaywrightCrawler crawls the web using a headless
+  // browser controlled by the Playwright library.
+  const crawler = new PlaywrightCrawler({
+    // Use the requestHandler to process each of the crawled pages.
+    async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+      const title = await page.title();
+      log.info(`Title of ${request.loadedUrl} is '${title}'`);
+
+      const html = await page.evaluate(() => {
+        const el = document.querySelector(
+          ".docs-builder-container"
+        ) as HTMLElement | null;
+
+        return el?.innerText || "";
+      });
+
+      // Save results as JSON to ./storage/datasets/default
+      await pushData({ title, url: request.loadedUrl, html });
+
+      // Extract links from the current page
+      // and add them to the crawling queue.
+      await enqueueLinks({
+        globs: [config.glob],
+      });
+    },
+    // Comment this option to scrape the full website.
+    maxRequestsPerCrawl: config.maxPagesToCrawl,
+    // Uncomment this option to see the browser window.
+    // headless: false,
+  });
+
+  // Add first URL to the queue and start the crawl.
+  await crawler.run([config.url]);
+}
+
+const jsonFiles = await glob("storage/datasets/default/*.json", {
+  absolute: true,
+});
+
+const results = [];
+for (const file of jsonFiles) {
+  const data = JSON.parse(await readFile(file, "utf-8"));
+  results.push(data);
+}
+
+await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
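
The crawl stores one JSON file per page in storage/datasets/default, and the final block above merges them into output.json as an array of { title, url, html } records (setting NO_CRAWL=true skips the crawl and only re-runs the merge). A small, hypothetical consumer of that file, with the record shape inferred from the pushData() call above:

// Read the merged output.json and list the crawled page titles.
// This consumer is an illustrative assumption, not part of this commit.
import { readFile } from "fs/promises";

type CrawledPage = { title: string; url?: string; html: string };

const pages: CrawledPage[] = JSON.parse(await readFile("output.json", "utf-8"));

for (const page of pages) {
  console.log(`${page.title} (${page.url ?? "unknown url"})`);
}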

+ 14 - 0
tsconfig.json

@@ -0,0 +1,14 @@
+{
+    "extends": "@apify/tsconfig",
+    "compilerOptions": {
+        "module": "ES2022",
+        "target": "ES2022",
+        "outDir": "dist",
+        "resolveJsonModule": true,
+        "noUnusedLocals": false,
+        "lib": ["DOM"]
+    },
+    "include": [
+        "./src/**/*"
+    ]
+}