@@ -2,7 +2,7 @@
import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
-import {Config, configSchema} from "./config.js";
+import { Config, configSchema } from "./config.js";
import { Page } from "playwright";
 
let pageCounter = 0;
@@ -16,7 +16,7 @@ export function getPageHtml(page: Page, selector = "body") {
document,
null,
XPathResult.ANY_TYPE,
- null
+ null,
);
let result = elements.iterateNext();
return result ? result.textContent || "" : "";
@@ -36,16 +36,16 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
document,
null,
XPathResult.ANY_TYPE,
- null
+ null,
);
return elements.iterateNext() !== null;
},
xpath,
- { timeout }
+ { timeout },
);
}
 
-export async function crawl(config: Config) {
+export async function crawl(config: Config) {
configSchema.parse(config);
 
if (process.env.NO_CRAWL !== "true") {
@@ -67,7 +67,7 @@ export async function crawl(config: Config) {
const title = await page.title();
pageCounter++;
log.info(
- `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
+ `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
);
 
// Use custom handling for XPath selector
@@ -76,7 +76,7 @@ export async function crawl(config: Config) {
await waitForXPath(
page,
config.selector,
- config.waitForSelectorTimeout ?? 1000
+ config.waitForSelectorTimeout ?? 1000,
);
} else {
await page.waitForSelector(config.selector, {
@@ -113,21 +113,25 @@ export async function crawl(config: Config) {
if (RESOURCE_EXCLUSTIONS.length === 0) {
return;
}
- await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
- log.info(`Aborting requests for as this is a resource excluded route`);
- }
+ await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
+ route.abort("aborted"),
+ );
+ log.info(
+ `Aborting requests for as this is a resource excluded route`,
+ );
+ },
],
});
 
const SITEMAP_SUFFIX = "sitemap.xml";
const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
-
+
if (isUrlASitemap) {
const listOfUrls = await downloadListOfUrls({ url: config.url });
-
+
// Add the initial URL to the crawling queue.
await crawler.addRequests(listOfUrls);
-
+
// Run the crawler
await crawler.run();
} else {