
chore: format

kunal00000 1 year ago
parent
commit
c41abe41dc
7 changed files with 48 additions and 42 deletions
  1. .github/workflows/build.yml (+1 -1)
  2. .github/workflows/release.yml (+3 -3)
  3. .github/workflows/test.yml (+1 -1)
  4. README.md (+4 -4)
  5. containerapp/data/config.ts (+1 -1)
  6. src/config.ts (+21 -19)
  7. src/core.ts (+17 -13)

+ 1 - 1
.github/workflows/build.yml

@@ -20,4 +20,4 @@ jobs:
       - run: npm run build
       - uses: preactjs/compressed-size-action@v2
         with:
-          pattern: ".dist/**/*.{js,ts,json}"
+          pattern: ".dist/**/*.{js,ts,json}"

+ 3 - 3
.github/workflows/release.yml

@@ -13,11 +13,11 @@ jobs:
       - uses: actions/checkout@v2
       - uses: actions/setup-node@v2
         with:
-            cache: npm
-            node-version: 18
+          cache: npm
+          node-version: 18
       - run: npm i
       - run: npm run build
       - run: npm run semantic-release
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
+          NPM_TOKEN: ${{ secrets.NPM_TOKEN }}

+ 1 - 1
.github/workflows/test.yml

@@ -11,7 +11,7 @@ jobs:
       - name: Set up Node.js
         uses: actions/setup-node@v2
         with:
-          node-version: '20'
+          node-version: "20"
       - name: Install Dependencies
         run: npm ci
       - name: Run prettier

+ 4 - 4
README.md

File diff too large to display

+ 1 - 1
containerapp/data/config.ts

@@ -5,4 +5,4 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "../data/output.json",
-};
+};

+ 21 - 19
src/config.ts

File diff too large to display

+ 17 - 13
src/core.ts

@@ -2,7 +2,7 @@
 import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import {Config, configSchema} from "./config.js";
+import { Config, configSchema } from "./config.js";
 import { Page } from "playwright";
 
 let pageCounter = 0;
@@ -16,7 +16,7 @@ export function getPageHtml(page: Page, selector = "body") {
         document,
         null,
         XPathResult.ANY_TYPE,
-        null
+        null,
       );
       let result = elements.iterateNext();
       return result ? result.textContent || "" : "";
@@ -36,16 +36,16 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
         document,
         null,
         XPathResult.ANY_TYPE,
-        null
+        null,
       );
       return elements.iterateNext() !== null;
     },
     xpath,
-    { timeout }
+    { timeout },
   );
 }
 
-export async function crawl(config: Config) { 
+export async function crawl(config: Config) {
   configSchema.parse(config);
 
   if (process.env.NO_CRAWL !== "true") {
@@ -67,7 +67,7 @@ export async function crawl(config: Config) {
         const title = await page.title();
         pageCounter++;
         log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
         );
 
         // Use custom handling for XPath selector
@@ -76,7 +76,7 @@ export async function crawl(config: Config) {
             await waitForXPath(
               page,
               config.selector,
-              config.waitForSelectorTimeout ?? 1000
+              config.waitForSelectorTimeout ?? 1000,
             );
           } else {
             await page.waitForSelector(config.selector, {
@@ -113,21 +113,25 @@ export async function crawl(config: Config) {
           if (RESOURCE_EXCLUSTIONS.length === 0) {
             return;
           }
-          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
-          log.info(`Aborting requests for as this is a resource excluded route`);
-        }
+          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
+            route.abort("aborted"),
+          );
+          log.info(
+            `Aborting requests for as this is a resource excluded route`,
+          );
+        },
       ],
     });
 
     const SITEMAP_SUFFIX = "sitemap.xml";
     const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
-  
+
     if (isUrlASitemap) {
       const listOfUrls = await downloadListOfUrls({ url: config.url });
-  
+
       // Add the initial URL to the crawling queue.
       await crawler.addRequests(listOfUrls);
-  
+
       // Run the crawler
       await crawler.run();
     } else {