
Merge branch 'main' into multiple-files

guillermoscript 1 year ago
commit 98a645ac1e

+ 1 - 1
.github/workflows/build.yml

@@ -20,4 +20,4 @@ jobs:
       - run: npm run build
       - uses: preactjs/compressed-size-action@v2
         with:
-          pattern: ".dist/**/*.{js,ts,json}"
+          pattern: ".dist/**/*.{js,ts,json}"

+ 3 - 3
.github/workflows/release.yml

@@ -13,11 +13,11 @@ jobs:
       - uses: actions/checkout@v2
       - uses: actions/setup-node@v2
         with:
-            cache: npm
-            node-version: 18
+          cache: npm
+          node-version: 18
       - run: npm i
       - run: npm run build
       - run: npm run semantic-release
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
+          NPM_TOKEN: ${{ secrets.NPM_TOKEN }}

+ 18 - 0
.github/workflows/test.yml

@@ -0,0 +1,18 @@
+name: Test workflow
+
+on: [push, pull_request]
+
+jobs:
+  prettier_check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Node.js
+        uses: actions/setup-node@v2
+        with:
+          node-version: "20"
+      - name: Install Dependencies
+        run: npm ci
+      - name: Run prettier
+        run: npm run prettier:check
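
The new workflow gates every push and pull request on npm run prettier:check, a script this same commit adds to package.json below. As a rough sketch of what that check does per file, using prettier's Node API (prettier ^3, where the API is promise-based; the isFormatted helper name is invented here for illustration):

    import { readFile } from "fs/promises";
    import prettier from "prettier";

    // Hypothetical per-file equivalent of `prettier --check`.
    async function isFormatted(path: string): Promise<boolean> {
      const source = await readFile(path, "utf8");
      // Honor .prettierrc and friends, falling back to defaults.
      const options = (await prettier.resolveConfig(path)) ?? {};
      // check() resolves to true when the file already matches the style.
      return prettier.check(source, { ...options, filepath: path });
    }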

File diff suppressed because it is too large
+ 5 - 5
README.md


+ 1 - 1
containerapp/data/config.ts

@@ -5,4 +5,4 @@ export const defaultConfig: Config = {
   match: "https://www.builder.io/c/docs/**",
   maxPagesToCrawl: 50,
   outputFileName: "../data/output.json",
-};
+};
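
Note: the src/config.ts diff in this commit is suppressed (further down), so for orientation, here is a minimal sketch of the Config shape this file appears to satisfy, inferred only from the fields used here and in src/core.ts; anything beyond those observed fields is an assumption, not the real schema:

    import { z } from "zod";

    // Inferred sketch only; the authoritative schema lives in src/config.ts.
    export const configSchema = z.object({
      url: z.string(), // start URL, or a sitemap.xml URL
      match: z.string(), // glob of pages to crawl
      selector: z.string().optional(), // CSS or XPath selector (assumed optional)
      maxPagesToCrawl: z.number().int().positive(),
      outputFileName: z.string(),
      waitForSelectorTimeout: z.number().optional(), // assumed optional
    });

    export type Config = z.infer<typeof configSchema>;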

+ 4 - 3
package-lock.json

@@ -1,12 +1,12 @@
 {
   "name": "@builder.io/gpt-crawler",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@builder.io/gpt-crawler",
-      "version": "1.0.0",
+      "version": "1.1.0",
       "hasInstallScript": true,
       "license": "ISC",
       "dependencies": {
@@ -17,7 +17,6 @@
         "gpt-tokenizer": "^2.1.2",
         "inquirer": "^9.2.12",
         "playwright": "*",
-        "prettier": "^3.1.0",
         "zod": "^3.22.4"
       },
       "bin": {
@@ -29,6 +28,7 @@
         "@semantic-release/git": "^10.0.1",
         "@types/inquirer": "^9.0.7",
         "@types/node": "^20.0.0",
+        "prettier": "^3.1.0",
         "semantic-release": "^22.0.8",
         "ts-node": "^10.8.0",
         "typescript": "^5.0.0"
@@ -8261,6 +8261,7 @@
       "version": "3.1.0",
       "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.1.0.tgz",
       "integrity": "sha512-TQLvXjq5IAibjh8EpBIkNKxO749UEWABoiIZehEPiY4GNpVdhaFKqSTu+QrlU6D2dPAfubRmtJTi4K4YkQ5eXw==",
+      "dev": true,
       "bin": {
         "prettier": "bin/prettier.cjs"
       },

+ 4 - 3
package.json

@@ -1,6 +1,6 @@
 {
   "name": "@builder.io/gpt-crawler",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "type": "module",
   "bin": {
     "gpt-crawler": "./dist/src/cli.js"
@@ -14,7 +14,6 @@
     "gpt-tokenizer": "^2.1.2",
     "inquirer": "^9.2.12",
     "playwright": "*",
-    "prettier": "^3.1.0",
     "zod": "^3.22.4"
   },
   "devDependencies": {
@@ -23,6 +22,7 @@
     "@semantic-release/git": "^10.0.1",
     "@types/inquirer": "^9.0.7",
     "@types/node": "^20.0.0",
+    "prettier": "^3.1.0",
     "semantic-release": "^22.0.8",
     "ts-node": "^10.8.0",
     "typescript": "^5.0.0"
@@ -35,7 +35,8 @@
     "start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
     "start:prod": "node dist/src/main.js",
     "build": "tsc",
-    "fmt": "prettier --write ."
+    "fmt": "prettier --write .",
+    "prettier:check": "prettier --check ."
   },
   "author": "It's not you it's me",
   "license": "ISC"

File diff suppressed because it is too large
+ 21 - 19
src/config.ts


+ 17 - 13
src/core.ts

@@ -2,7 +2,7 @@
 import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
-import {Config, configSchema} from "./config.js";
+import { Config, configSchema } from "./config.js";
 import { Page } from "playwright";
 import {
   isWithinTokenLimit,
@@ -19,7 +19,7 @@ export function getPageHtml(page: Page, selector = "body") {
         document,
         null,
         XPathResult.ANY_TYPE,
-        null
+        null,
       );
       let result = elements.iterateNext();
       return result ? result.textContent || "" : "";
@@ -39,16 +39,16 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
         document,
         null,
         XPathResult.ANY_TYPE,
-        null
+        null,
       );
       return elements.iterateNext() !== null;
     },
     xpath,
-    { timeout }
+    { timeout },
   );
 }
 
-export async function crawl(config: Config) { 
+export async function crawl(config: Config) {
   configSchema.parse(config);
 
   if (process.env.NO_CRAWL !== "true") {
@@ -70,7 +70,7 @@ export async function crawl(config: Config) {
         const title = await page.title();
         pageCounter++;
         log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
+          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
         );
 
         // Use custom handling for XPath selector
@@ -79,7 +79,7 @@ export async function crawl(config: Config) {
             await waitForXPath(
               page,
               config.selector,
-              config.waitForSelectorTimeout ?? 1000
+              config.waitForSelectorTimeout ?? 1000,
             );
           } else {
             await page.waitForSelector(config.selector, {
@@ -116,21 +116,25 @@ export async function crawl(config: Config) {
           if (RESOURCE_EXCLUSTIONS.length === 0) {
             return;
           }
-          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
-          log.info(`Aborting requests for as this is a resource excluded route`);
-        }
+          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, (route) =>
+            route.abort("aborted"),
+          );
+          log.info(
+            `Aborting requests for as this is a resource excluded route`,
+          );
+        },
       ],
     });
 
     const SITEMAP_SUFFIX = "sitemap.xml";
     const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
-  
+
     if (isUrlASitemap) {
       const listOfUrls = await downloadListOfUrls({ url: config.url });
-  
+
       // Add the initial URL to the crawling queue.
       await crawler.addRequests(listOfUrls);
-  
+
       // Run the crawler
       await crawler.run();
     } else {
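
The tail of this core.ts hunk (whitespace-only reformatting by prettier) is the sitemap branch of crawl(). As a standalone illustration of that flow, assuming crawlee's downloadListOfUrls(), which fetches a sitemap and returns the URLs it lists as a string[]; the seedAndRun helper name and the else branch, which the diff cuts off, are assumptions:

    import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";

    // Hypothetical helper mirroring the branch above in isolation.
    async function seedAndRun(crawler: PlaywrightCrawler, url: string) {
      if (url.endsWith("sitemap.xml")) {
        // Fetch the sitemap and enqueue every URL it lists.
        const listOfUrls = await downloadListOfUrls({ url });
        await crawler.addRequests(listOfUrls);
        await crawler.run();
      } else {
        // The diff ends at "} else {", so this branch is assumed:
        // crawl starting from the single start URL.
        await crawler.run([url]);
      }
    }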