
Merge branch 'main' into multiple-files

guillermoscript 1 year ago
commit 98a645ac1e

+ 1 - 1
.github/workflows/build.yml

@@ -20,4 +20,4 @@ jobs:
       - run: npm run build
       - uses: preactjs/compressed-size-action@v2
         with:
-          pattern: ".dist/**/*.{js,ts,json}"
+          pattern: ".dist/**/*.{js,ts,json}"

+ 3 - 3
.github/workflows/release.yml

@@ -13,11 +13,11 @@ jobs:
       - uses: actions/checkout@v2
       - uses: actions/setup-node@v2
         with:
-            cache: npm
-            node-version: 18
+          cache: npm
+          node-version: 18
       - run: npm i
       - run: npm run build
       - run: npm run semantic-release
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
+          NPM_TOKEN: ${{ secrets.NPM_TOKEN }}

+ 18 - 0
.github/workflows/test.yml

@@ -0,0 +1,18 @@
+name: Test workflow
+
+on: [push, pull_request]
+
+jobs:
+  prettier_check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Node.js
+        uses: actions/setup-node@v2
+        with:
+          node-version: "20"
+      - name: Install Dependencies
+        run: npm ci
+      - name: Run prettier
+        run: npm run prettier:check
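
The new workflow gates every push and pull request on npm run prettier:check, a script this same commit adds to package.json below. As a rough sketch of what that check does per file, using prettier's Node API (prettier ^3, where the API is promise-based; the isFormatted helper name is invented here for illustration):

    import { readFile } from "fs/promises";
    import prettier from "prettier";

    // Hypothetical per-file equivalent of `prettier --check`.
    async function isFormatted(path: string): Promise<boolean> {
      const source = await readFile(path, "utf8");
      // Honor .prettierrc and friends, falling back to defaults.
      const options = (await prettier.resolveConfig(path)) ?? {};
      // check() resolves to true when the file already matches the style.
      return prettier.check(source, { ...options, filepath: path });
    }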

File diff suppressed because it is too large
+ 5 - 5
README.md


+ 1 - 1
containerapp/data/config.ts

@@ -5,4 +5,4 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "../data/output.json",
-};
+};
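
Note: the src/config.ts diff in this commit is suppressed (further down), so for orientation, here is a minimal sketch of the Config shape this file appears to satisfy, inferred only from the fields used here and in src/core.ts; anything beyond those observed fields is an assumption, not the real schema:

    import { z } from "zod";

    // Inferred sketch only; the authoritative schema lives in src/config.ts.
    export const configSchema = z.object({
      url: z.string(), // start URL, or a sitemap.xml URL
      match: z.string(), // glob of pages to crawl
      selector: z.string().optional(), // CSS or XPath selector (assumed optional)
      maxPagesToCrawl: z.number().int().positive(),
      outputFileName: z.string(),
      waitForSelectorTimeout: z.number().optional(), // assumed optional
    });

    export type Config = z.infer<typeof configSchema>;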

+ 4 - 3
package-lock.json

@@ -1,12 +1,12 @@
 {
   "name": "@builder.io/gpt-crawler",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@builder.io/gpt-crawler",
-      "version": "1.0.0",
+      "version": "1.1.0",
       "hasInstallScript": true,
       "license": "ISC",
       "dependencies": {
@@ -17,7 +17,6 @@
         "gpt-tokenizer": "^2.1.2",
         "inquirer": "^9.2.12",
         "playwright": "*",
-        "prettier": "^3.1.0",
         "zod": "^3.22.4"
       },
       "bin": {
@@ -29,6 +28,7 @@
         "@semantic-release/git": "^10.0.1",
         "@types/inquirer": "^9.0.7",
         "@types/node": "^20.0.0",
+        "prettier": "^3.1.0",
         "semantic-release": "^22.0.8",
         "ts-node": "^10.8.0",
         "typescript": "^5.0.0"
@@ -8261,6 +8261,7 @@
       "version": "3.1.0",
       "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.1.0.tgz",
       "integrity": "sha512-TQLvXjq5IAibjh8EpBIkNKxO749UEWABoiIZehEPiY4GNpVdhaFKqSTu+QrlU6D2dPAfubRmtJTi4K4YkQ5eXw==",
+      "dev": true,
       "bin": {
         "prettier": "bin/prettier.cjs"
       },

+ 4 - 3
package.json

@@ -1,6 +1,6 @@
 {
   "name": "@builder.io/gpt-crawler",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "type": "module",
   "bin": {
     "gpt-crawler": "./dist/src/cli.js"
@@ -14,7 +14,6 @@
     "gpt-tokenizer": "^2.1.2",
     "inquirer": "^9.2.12",
     "playwright": "*",
-    "prettier": "^3.1.0",
     "zod": "^3.22.4"
   },
   "devDependencies": {
@@ -23,6 +22,7 @@
     "@semantic-release/git": "^10.0.1",
     "@types/inquirer": "^9.0.7",
     "@types/node": "^20.0.0",
+    "prettier": "^3.1.0",
     "semantic-release": "^22.0.8",
     "ts-node": "^10.8.0",
     "typescript": "^5.0.0"
@@ -35,7 +35,8 @@
     "start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/src/main.js",
     "build": "tsc",
-    "fmt": "prettier --write ."
+    "fmt": "prettier --write .",
+    "prettier:check": "prettier --check ."
   },
   "author": "It's not you it's me",
   "license": "ISC"

File diff suppressed because it is too large
+ 21 - 19
src/config.ts


+ 17 - 13
src/core.ts

@@ -2,7 +2,7 @@
 import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import {Config, configSchema} from "./config.js";
+import { Config, configSchema } from "./config.js";
 import { Page } from "playwright";
 import {
   isWithinTokenLimit,
@@ -19,7 +19,7 @@ export function getPageHtml(page: Page, selector = "body") {
         document,
         null,
         XPathResult.ANY_TYPE,
-        null
+        null,
       );
       let result = elements.iterateNext();
       return result ? result.textContent || "" : "";
@@ -39,16 +39,16 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
         document,
         null,
         XPathResult.ANY_TYPE,
-        null
+        null,
       );
       return elements.iterateNext() !== null;
     },
     xpath,
-    { timeout }
+    { timeout },
   );
 }
 
-export async function crawl(config: Config) { 
+export async function crawl(config: Config) {
   configSchema.parse(config);
 
   if (process.env.NO_CRAWL !== "true") {
@@ -70,7 +70,7 @@ export async function crawl(config: Config) {
         const title = await page.title();
         pageCounter++;
         log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
         );
 
         // Use custom handling for XPath selector
@@ -79,7 +79,7 @@ export async function crawl(config: Config) {
             await waitForXPath(
               page,
               config.selector,
-              config.waitForSelectorTimeout ?? 1000
+              config.waitForSelectorTimeout ?? 1000,
             );
           } else {
             await page.waitForSelector(config.selector, {
@@ -116,21 +116,25 @@ export async function crawl(config: Config) {
           if (RESOURCE_EXCLUSTIONS.length === 0) {
             return;
           }
-          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
-          log.info(`Aborting requests for as this is a resource excluded route`);
-        }
+          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
+            route.abort("aborted"),
+          );
+          log.info(
+            `Aborting requests for as this is a resource excluded route`,
+          );
+        },
       ],
     });
 
     const SITEMAP_SUFFIX = "sitemap.xml";
     const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
-  
+
     if (isUrlASitemap) {
       const listOfUrls = await downloadListOfUrls({ url: config.url });
-  
+
       // Add the initial URL to the crawling queue.
       await crawler.addRequests(listOfUrls);
-  
+
       // Run the crawler
       await crawler.run();
     } else {
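
The tail of this core.ts hunk (whitespace-only reformatting by prettier) is the sitemap branch of crawl(). As a standalone illustration of that flow, assuming crawlee's downloadListOfUrls(), which fetches a sitemap and returns the URLs it lists as a string[]; the seedAndRun helper name and the else branch, which the diff cuts off, are assumptions:

    import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";

    // Hypothetical helper mirroring the branch above in isolation.
    async function seedAndRun(crawler: PlaywrightCrawler, url: string) {
      if (url.endsWith("sitemap.xml")) {
        // Fetch the sitemap and enqueue every URL it lists.
        const listOfUrls = await downloadListOfUrls({ url });
        await crawler.addRequests(listOfUrls);
        await crawler.run();
      } else {
        // The diff ends at "} else {", so this branch is assumed:
        // crawl starting from the single start URL.
        await crawler.run([url]);
      }
    }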