Pārlūkot izejas kodu

chore: make cli execution default

marcelovicentegc 1 gadu atpakaļ
vecāks
revīzija
ade21571e5
8 mainīti faili ar 96 papildinājumiem un 110 dzēšanām
  1. 0 51
      Dockerfile
  2. 2 3
      package.json
  3. 69 35
      src/cli.ts
  4. 23 5
      config.ts
  5. 1 1
      src/core.ts
  6. 0 13
      src/hardcoded.ts
  7. 0 1
      src/main.ts
  8. 1 1
      tsconfig.json

+ 0 - 51
Dockerfile

@@ -1,51 +0,0 @@
-# Specify the base Docker image. You can read more about
-# the available images at https://crawlee.dev/docs/guides/docker-images
-# You can also use any other image from Docker Hub.
-FROM apify/actor-node-playwright-chrome:18 AS builder
-
-# Copy just package.json and package-lock.json
-# to speed up the build using Docker layer cache.
-COPY --chown=myuser package*.json ./
-
-# Install all dependencies. Don't audit to speed up the installation.
-RUN bun install --include=dev --audit=false
-
-# Next, copy the source files using the user set
-# in the base image.
-COPY --chown=myuser . ./
-
-# Install all dependencies and build the project.
-# Don't audit to speed up the installation.
-RUN bun run build
-
-# Create final image
-FROM apify/actor-node-playwright-chrome:18
-
-# Copy only built JS files from builder image
-COPY --from=builder --chown=myuser /home/myuser/dist ./dist
-
-# Copy just package.json and package-lock.json
-# to speed up the build using Docker layer cache.
-COPY --chown=myuser package*.json ./
-
-# Install NPM packages, skip optional and development dependencies to
-# keep the image small. Avoid logging too much and print the dependency
-# tree for debugging
-RUN bun --quiet set progress=false \
-    && bun install --omit=dev --omit=optional \
-    && echo "Installed NPM packages:" \
-    && (bun list --omit=dev --all || true) \
-    && echo "Node.js version:" \
-    && node --version \
-    && echo "NPM version:" \
-    && bun --version
-
-# Next, copy the remaining files and directories with the source code.
-# Since we do this after NPM install, quick build will be really fast
-# for most source file changes.
-COPY --chown=myuser . ./
-
-
-# Run the image. If you know you won't need headful browsers,
-# you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && bun run start:prod --silent

+ 2 - 3
package.json

@@ -23,9 +23,8 @@
   "scripts": {
     "preinstall": "bunx playwright install",
     "start": "bun run start:dev",
-    "start:cli": "NODE_ENV=development bun run build && node dist/src/cli.js",
-    "start:prod": "node dist/src/main.js",
-    "start:dev": "bun run build && node --no-warnings=ExperimentalWarning dist/src/main.js",
+    "start:dev": "NODE_ENV=development bun run build && node dist/cli.js",
+    "start:prod": "node dist/main.js",
     "build": "tsc"
   },
   "author": "It's not you it's me",

+ 69 - 35
src/cli.ts

@@ -1,47 +1,77 @@
 #!/usr/bin/env node
 
 import { program } from "commander";
-import { Config } from "../config.js";
+import { Config } from "./config.js";
 import { crawl, write } from "./core.js";
 import { createRequire } from "node:module";
 import inquirer from "inquirer";
 
 const require = createRequire(import.meta.url);
-const { version, description } = require("../../package.json");
+const { version, description } = require("../package.json");
 
-async function handler(options: any) {
+const messages = {
+  url: "What is the first URL of the website you want to crawl?",
+  match: "What is the URL pattern you want to match?",
+  selector: "What is the CSS selector you want to match?",
+  maxPagesToCrawl: "How many pages do you want to crawl?",
+  outputFileName: "What is the name of the output file?",
+};
+
+async function handler(options: Config) {
   try {
+    const {
+      url,
+      match,
+      selector,
+      maxPagesToCrawl: maxPagesToCrawlStr,
+      outputFileName,
+    } = options;
+
+    // @ts-ignore
+    const maxPagesToCrawl = parseInt(maxPagesToCrawlStr, 10);
+
     let config: Config = {
-      url: options.url,
-      match: options.match,
-      selector: options.selector,
-      maxPagesToCrawl: 50,
-      outputFileName: options.outputFileName ?? "output.json",
+      url,
+      match,
+      selector,
+      maxPagesToCrawl,
+      outputFileName,
     };
 
     if (!config.url || !config.match || !config.selector) {
-      const { url, match, selector } = await inquirer
-        .prompt([
-          {
-            type: "input",
-            name: "url",
-            message: "What is the URL of the website you want to crawl?",
-          },
-          {
-            type: "input",
-            name: "match",
-            message: "What is the URL pattern you want to match?",
-          },
-          {
-            type: "input",
-            name: "selector",
-            message: "What is the CSS selector you want to match?",
-          },
-        ]);
-
-      config.url = url;
-      config.match = match;
-      config.selector = selector;
+      const questions = [];
+
+      if (!config.url) {
+        questions.push({
+          type: "input",
+          name: "url",
+          message: messages.url,
+        });
+      }
+
+      if (!config.match) {
+        questions.push({
+          type: "input",
+          name: "match",
+          message: messages.match,
+        });
+      }
+
+      if (!config.selector) {
+        questions.push({
+          type: "input",
+          name: "selector",
+          message: messages.selector,
+        });
+      }
+
+      const answers = await inquirer
+        .prompt(questions);
+
+      config = {
+        ...config,
+        ...answers,
+      };
     }
 
     await crawl(config);
@@ -56,11 +86,15 @@ program
   .description(description);
 
 program
-  .option("-u, --url")
-  .option("-m, --match")
-  .option("-s, --selector")
-  .option("-m, --maxPagesToCrawl")
-  .option("-o, --outputFileName")
+  .option("-u, --url <string>", messages.url, "")
+  .option("-m, --match <string>", messages.match, "")
+  .option("-s, --selector <string>", messages.selector, "")
+  .option("-m, --maxPagesToCrawl <number>", messages.maxPagesToCrawl, "50")
+  .option(
+    "-o, --outputFileName <string>",
+    messages.outputFileName,
+    "output.json",
+  )
   .action(handler);
 
 program.parse();

+ 23 - 5
config.ts

@@ -1,15 +1,33 @@
 import { Page } from "playwright";
 
 export type Config = {
-  /** URL to start the crawl */
+  /**
+   * URL to start the crawl
+   * @example "https://www.builder.io/c/docs/developers"
+   * @default ""
+   */
   url: string;
-  /** Pattern to match against for links on a page to subsequently crawl */
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
   match: string | string[];
-  /** Selector to grab the inner text from */
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   */
   selector: string;
-  /** Don't crawl more than this many pages */
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
   maxPagesToCrawl: number;
-  /** File name for the finished data */
+  /**
+   * File name for the finished data
+   * @default "output.json"
+   */
   outputFileName: string;
   /** Optional cookie to be set. E.g. for Cookie Consent */
   cookie?: { name: string; value: string };

+ 1 - 1
src/core.ts

@@ -2,7 +2,7 @@
 import { PlaywrightCrawler } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import { Config } from "../config";
+import { Config } from "./config";
 import { Page } from "playwright";
 
 let pageCounter = 0; 

+ 0 - 13
src/hardcoded.ts

@@ -1,13 +0,0 @@
-import { Config } from "../config.js";
-import { crawl, write } from "./core.js";
-
-const hardcodedConfig: Config = {
-  url: "https://www.builder.io/c/docs/developers",
-  match: "https://www.builder.io/c/docs/**",
-  selector: `.docs-builder-container`,
-  maxPagesToCrawl: 50,
-  outputFileName: "output.json",
-};
-
-await crawl(hardcodedConfig);
-await write(hardcodedConfig);

+ 0 - 1
src/main.ts

@@ -1,3 +1,2 @@
 export * from "./core.js";
 export * from "./cli.js";
-export * from "./hardcoded.js";

+ 1 - 1
tsconfig.json

@@ -9,5 +9,5 @@
     "skipLibCheck": true,
     "lib": ["DOM"]
   },
-  "include": ["./src/**/*", "./config.ts"]
+  "include": ["./src/**/*", "src/config.ts"]
 }