Sfoglia il codice sorgente

Merge remote-tracking branch 'upstream/main' into sitemap-support

guillermoscript 1 anno fa
parent
commit
3935d68378
7 file modificati con 75 aggiunte e 51 eliminazioni
  1. 3 0
      .gitignore
  2. 3 26
      containerapp/data/config.ts
  3. 28 1
      package-lock.json
  4. 5 3
      package.json
  5. 29 19
      src/config.ts
  6. 5 1
      src/core.ts
  7. 2 1
      tsconfig.json

+ 3 - 0
.gitignore

@@ -6,6 +6,9 @@ node_modules
 apify_storage
 crawlee_storage
 storage
+.DS_Store
+
+!package.json
 
 # any output from the crawler
 *.json

+ 3 - 26
containerapp/data/config.ts

@@ -1,31 +1,8 @@
-import { Page } from "playwright";
+import { Config } from "./src/config";
 
-type Config = {
-  /** URL to start the crawl */
-  url: string;
-  /** Pattern to match against for links on a page to subsequently crawl */
-  match: string;
-  /** Selector to grab the inner text from */
-  selector: string;
-  /** Don't crawl more than this many pages */
-  maxPagesToCrawl: number;
-  /** File name for the finished data */
-  outputFileName: string;
-  /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
-  /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
-};
-
-export const config: Config = {
+export const defaultConfig: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
-  selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
   outputFileName: "../data/output.json",
-};
+};

+ 28 - 1
package-lock.json

@@ -12,10 +12,12 @@
       "dependencies": {
         "commander": "^11.1.0",
         "crawlee": "^3.0.0",
+        "cross-env": "^7.0.3",
         "glob": "^10.3.10",
         "inquirer": "^9.2.12",
         "playwright": "*",
-        "prettier": "^3.1.0"
+        "prettier": "^3.1.0",
+        "zod": "^3.22.4"
       },
       "bin": {
         "gpt-crawler": "dist/src/cli.js"
@@ -1342,6 +1344,23 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/cross-env": {
+      "version": "7.0.3",
+      "resolved": "https://registry.npmjs.org/cross-env/-/cross-env-7.0.3.tgz",
+      "integrity": "sha512-+/HKd6EgcQCJGh2PSjZuUitQBQynKor4wrFbRg4DtAgS1aWO+gU52xpH7M9ScGgXSYmAVS9bIJ8EzuaGw0oNAw==",
+      "dependencies": {
+        "cross-spawn": "^7.0.1"
+      },
+      "bin": {
+        "cross-env": "src/bin/cross-env.js",
+        "cross-env-shell": "src/bin/cross-env-shell.js"
+      },
+      "engines": {
+        "node": ">=10.14",
+        "npm": ">=6",
+        "yarn": ">=1"
+      }
+    },
     "node_modules/cross-spawn": {
       "version": "7.0.3",
       "license": "MIT",
@@ -3476,6 +3495,14 @@
       "funding": {
         "url": "https://github.com/sponsors/sindresorhus"
       }
+    },
+    "node_modules/zod": {
+      "version": "3.22.4",
+      "resolved": "https://registry.npmjs.org/zod/-/zod-3.22.4.tgz",
+      "integrity": "sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==",
+      "funding": {
+        "url": "https://github.com/sponsors/colinhacks"
+      }
     }
   }
 }

+ 5 - 3
package.json

@@ -9,10 +9,12 @@
   "dependencies": {
     "commander": "^11.1.0",
     "crawlee": "^3.0.0",
+    "cross-env": "^7.0.3",
     "glob": "^10.3.10",
     "inquirer": "^9.2.12",
     "playwright": "*",
-    "prettier": "^3.1.0"
+    "prettier": "^3.1.0",
+    "zod": "^3.22.4"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
@@ -24,8 +26,8 @@
   "scripts": {
     "preinstall": "npx playwright install",
     "start": "npm run start:dev",
-    "start:cli": "NODE_ENV=development npm run build && node dist/src/cli.js",
-    "start:dev": "NODE_ENV=development npm run build && node dist/src/main.js",
+    "start:cli": "cross-env NODE_ENV=development npm run build && node dist/src/cli.js",
+    "start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/main.js",
     "build": "tsc",
     "fmt": "prettier --write ."

File diff suppressed because it is too large
+ 29 - 19
src/config.ts


+ 5 - 1
src/core.ts

@@ -2,7 +2,7 @@
 import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import { Config } from "./config.js";
+import {Config, configSchema} from "./config.js";
 import { Page } from "playwright";
 
 let pageCounter = 0;
@@ -46,6 +46,8 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
 }
 
 export async function crawl(config: Config) { 
+  configSchema.parse(config);
+
   if (process.env.NO_CRAWL !== "true") {
     // PlaywrightCrawler crawls the web using a headless
     // browser controlled by the Playwright library.
@@ -136,6 +138,8 @@ export async function crawl(config: Config) {
 }
 
 export async function write(config: Config) {
+  configSchema.parse(config);
+
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });

+ 2 - 1
tsconfig.json

@@ -7,7 +7,8 @@
     "resolveJsonModule": true,
     "noUnusedLocals": false,
     "skipLibCheck": true,
-    "lib": ["DOM"]
+    "lib": ["DOM"],
+    "strict": true,
   },
   "include": ["./src/**/*", "config.ts"]
 }