@@ -0,0 +1,58 @@
+// For more information, see https://crawlee.dev/
+import { PlaywrightCrawler } from "crawlee";
+import { readFile, writeFile } from "fs/promises";
+import { glob } from "glob";
+import { config } from "../config";
+
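+// Set NO_CRAWL=true to skip crawling and only rebuild the output from an existing dataset.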
+if (process.env.NO_CRAWL !== "true") {
+ // PlaywrightCrawler crawls the web using a headless
+ // browser controlled by the Playwright library.
+ const crawler = new PlaywrightCrawler({
+ // Use the requestHandler to process each of the crawled pages.
+ async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+ const title = await page.title();
+ log.info(`Title of ${request.loadedUrl} is '${title}'`);
+
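+      // Extract the visible text of the docs container; this callback runs in the browser context.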
+ const html = await page.evaluate(() => {
+ const el = document.querySelector(
+ ".docs-builder-container"
+ ) as HTMLElement | null;
+
+ return el?.innerText || "";
+ });
+
+ // Save results as JSON to ./storage/datasets/default
+ await pushData({ title, url: request.loadedUrl, html });
+
+ // Extract links from the current page
+ // and add them to the crawling queue.
+ await enqueueLinks({
+ globs: [config.glob],
+ });
+ },
+    // Comment out this option to scrape the full website.
+ maxRequestsPerCrawl: config.maxPagesToCrawl,
+ // Uncomment this option to see the browser window.
+ // headless: false,
+ });
+
+ // Add first URL to the queue and start the crawl.
+ await crawler.run([config.url]);
+}
+
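+// Crawlee writes each pushData record to its own JSON file under storage/datasets/default.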
+const jsonFiles = await glob("storage/datasets/default/*.json", {
+ absolute: true,
+});
+
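+// Read every dataset file and collect the parsed records into a single array.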
+const results = [];
+for (const file of jsonFiles) {
+ const data = JSON.parse(await readFile(file, "utf-8"));
+ results.push(data);
+}
+
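+// Write the combined records to the configured output file as pretty-printed JSON.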
+await writeFile(config.outputFileName, JSON.stringify(results, null, 2));