瀏覽代碼

Merge pull request #54 from iperzic/type-validation

Config validation
Steve Sewell 1 年之前
父節點
當前提交
550f1e6b5d
共有 5 個文件被更改，包括 48 次插入和 17 次刪除
  1. 10 1
      package-lock.json
  2. 2 1
      package.json
  3. 29 13
      src/config.ts
  4. 5 1
      src/core.ts
  5. 2 1
      tsconfig.json

+ 10 - 1
package-lock.json

@@ -16,7 +16,8 @@
         "glob": "^10.3.10",
         "inquirer": "^9.2.12",
         "playwright": "*",
-        "prettier": "^3.1.0"
+        "prettier": "^3.1.0",
+        "zod": "^3.22.4"
       },
       "bin": {
         "gpt-crawler": "dist/src/cli.js"
@@ -3494,6 +3495,14 @@
       "funding": {
         "url": "https://github.com/sponsors/sindresorhus"
       }
+    },
+    "node_modules/zod": {
+      "version": "3.22.4",
+      "resolved": "https://registry.npmjs.org/zod/-/zod-3.22.4.tgz",
+      "integrity": "sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==",
+      "funding": {
+        "url": "https://github.com/sponsors/colinhacks"
+      }
     }
   }
 }

+ 2 - 1
package.json

@@ -13,7 +13,8 @@
     "glob": "^10.3.10",
     "inquirer": "^9.2.12",
     "playwright": "*",
-    "prettier": "^3.1.0"
+    "prettier": "^3.1.0",
+    "zod": "^3.22.4"
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",

+ 29 - 13
src/config.ts

@@ -1,41 +1,57 @@
+import { z } from 'zod';
+
 import type { Page } from "playwright";
 
-export type Config = {
+const Page: z.ZodType<Page> = z.any();
+
+export const configSchema = z.object({
   /**
    * URL to start the crawl
    * @example "https://www.builder.io/c/docs/developers"
    * @default ""
    */
-  url: string;
+  url: z.string(),
   /**
    * Pattern to match against for links on a page to subsequently crawl
    * @example "https://www.builder.io/c/docs/**"
    * @default ""
    */
-  match: string | string[];
+  match: z.string().or(z.array(z.string())),
+
   /**
    * Selector to grab the inner text from
    * @example ".docs-builder-container"
    * @default ""
    */
-  selector?: string;
+  selector: z.string().optional(),
   /**
    * Don't crawl more than this many pages
    * @default 50
    */
-  maxPagesToCrawl: number;
+  maxPagesToCrawl: z.number().int().positive(),
   /**
    * File name for the finished data
    * @default "output.json"
    */
-  outputFileName: string;
+  outputFileName: z.string(),
   /** Optional cookie to be set. E.g. for Cookie Consent */
-  cookie?: { name: string; value: string };
+  cookie: z.object({
+    name: z.string(),
+    value: z.string(),
+  }).optional(),
   /** Optional function to run for each page found */
-  onVisitPage?: (options: {
-    page: Page;
-    pushData: (data: any) => Promise<void>;
-  }) => Promise<void>;
+  onVisitPage: z.function()
+      .args(z.object({
+        page: Page,
+        pushData: z.function()
+            .args(z.any())
+            .returns(z.promise(z.void()))
+      }))
+      .returns(z.promise(z.void()))
+      .optional(),
   /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout?: number;
-};
+  waitForSelectorTimeout: z.number().int().nonnegative().optional(),
+});
+
+export type Config = z.infer<typeof configSchema>;
+

+ 5 - 1
src/core.ts

@@ -2,7 +2,7 @@
 import { PlaywrightCrawler } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import { Config } from "./config.js";
+import {Config, configSchema} from "./config.js";
 import { Page } from "playwright";
 
 let pageCounter = 0;
@@ -46,6 +46,8 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
 }
 
 export async function crawl(config: Config) {
+  configSchema.parse(config);
+
   if (process.env.NO_CRAWL !== "true") {
     // PlaywrightCrawler crawls the web using a headless
     // browser controlled by the Playwright library.
@@ -111,6 +113,8 @@ export async function crawl(config: Config) {
 }
 
 export async function write(config: Config) {
+  configSchema.parse(config);
+
   const jsonFiles = await glob("storage/datasets/default/*.json", {
     absolute: true,
   });

+ 2 - 1
tsconfig.json

@@ -7,7 +7,8 @@
     "resolveJsonModule": true,
     "noUnusedLocals": false,
     "skipLibCheck": true,
-    "lib": ["DOM"]
+    "lib": ["DOM"],
+    "strict": true,
   },
   "include": ["./src/**/*", "config.ts"]
 }