Steve Sewell 1 年之前
父節點
當前提交
ae80eb88dd
共有 4 個文件被更改，包括 25 次插入和 20 次刪除
  1. 6 6
      config.ts
  2. 7 0
      forum.json
  3. 2 2
      src/main.ts
  4. 10 12
      tsconfig.json

+ 6 - 6
config.ts

@@ -1,15 +1,15 @@
 type Config = {
   url: string;
-  glob: string;
+  match: string;
   selector: string;
   maxPagesToCrawl: number;
   outputFileName: string;
 };
 
 export const config = {
-  url: "https://www.builder.io/c/docs/developer",
-  glob: "https://www.builder.io/c/docs/*",
-  selector: ".docs-builder-container",
-  maxPagesToCrawl: 500,
-  outputFileName: "output.json",
+  url: "https://forum.builder.io",
+  match: "https://forum.builder.io/t/**",
+  selector: ".posts-wrapper",
+  maxPagesToCrawl: 1000,
+  outputFileName: "forum.json",
 } satisfies Config;

+ 7 - 0
forum.json

@@ -0,0 +1,7 @@
+[
+  {
+    "title": "Builder.io Forum - Help and tips for Builder.io",
+    "url": "https://forum.builder.io/",
+    "html": ""
+  }
+]

+ 2 - 2
src/main.ts

@@ -2,7 +2,7 @@
 import { PlaywrightCrawler } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import { config } from "../config";
+import { config } from "../config.js";
 
 if (process.env.NO_CRAWL !== "true") {
   // PlaywrightCrawler crawls the web using a headless
@@ -27,7 +27,7 @@ if (process.env.NO_CRAWL !== "true") {
       // Extract links from the current page
       // and add them to the crawling queue.
       await enqueueLinks({
-        globs: [config.glob],
+        globs: [config.match],
       });
     },
     // Comment this option to scrape the full website.

+ 10 - 12
tsconfig.json

@@ -1,14 +1,12 @@
 {
-    "extends": "@apify/tsconfig",
-    "compilerOptions": {
-        "module": "ES2022",
-        "target": "ES2022",
-        "outDir": "dist",
-        "resolveJsonModule": true,
-        "noUnusedLocals": false,
-        "lib": ["DOM"]
-    },
-    "include": [
-        "./src/**/*"
-    ]
+  "extends": "@apify/tsconfig",
+  "compilerOptions": {
+    "module": "ES2022",
+    "target": "ES2022",
+    "outDir": "dist",
+    "resolveJsonModule": true,
+    "noUnusedLocals": false,
+    "lib": ["DOM"]
+  },
+  "include": ["./src/**/*", "./config.ts"]
 }