فهرست منبع

Merge branch 'main' into multiple-files

guillermoscript 1 سال پیش
والد
کامیت
fa2d2ba56a
14 فایل تغییر یافته به همراه 8556 افزوده شده و 1926 حذف شده
  1. BIN
      .DS_Store
  2. 3 0
      .dockerignore
  3. 23 0
      .github/workflows/build.yml
  4. 23 0
      .github/workflows/release.yml
  5. 4 1
      .gitignore
  6. 12 0
      .releaserc
  7. 1 2
      Dockerfile
  8. 0 21
      LICENSE
  9. 15 0
      License
  10. 7 1
      README.md
  11. 8425 1893
      package-lock.json
  12. 6 2
      package.json
  13. 8 2
      src/config.ts
  14. 29 4
      src/core.ts

BIN
.DS_Store


+ 3 - 0
.dockerignore

@@ -8,3 +8,6 @@ storage
 
 # installed files
 node_modules
+
+# ignore base image 'main.js'
+main.js

+ 23 - 0
.github/workflows/build.yml

@@ -0,0 +1,23 @@
+name: Build workflow
+
+on:
+  pull_request:
+    types: [opened, reopened, synchronize]
+
+jobs:
+  build:
+    name: build
+    runs-on: ubuntu-latest
+    env:
+      CI_JOB_NUMBER: 1
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-node@v2
+        with:
+          cache: npm
+          node-version: 18
+      - run: npm i
+      - run: npm run build
+      - uses: preactjs/compressed-size-action@v2
+        with:
+          pattern: ".dist/**/*.{js,ts,json}"

+ 23 - 0
.github/workflows/release.yml

@@ -0,0 +1,23 @@
+name: Release workflow
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  release:
+    name: release
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-node@v2
+        with:
+            cache: npm
+            node-version: 18
+      - run: npm i
+      - run: npm run build
+      - run: npm run semantic-release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          NPM_TOKEN: ${{ secrets.NPM_TOKEN }}

+ 4 - 1
.gitignore

@@ -9,6 +9,9 @@ storage
 .DS_Store
 
 !package.json
+!package-lock.json
+!tsconfig.json
 
 # any output from the crawler
-*.json
+*.json
+pnpm-lock.yaml

+ 12 - 0
.releaserc

@@ -0,0 +1,12 @@
+{
+  "branches": [
+    "main"
+  ],
+  "plugins": [
+    "@semantic-release/commit-analyzer",
+    "@semantic-release/changelog",
+    "@semantic-release/npm",
+    "@semantic-release/git",
+    "@semantic-release/github"
+  ]
+}

+ 1 - 2
Dockerfile

@@ -45,7 +45,6 @@ RUN npm --quiet set progress=false \
 # for most source file changes.
 COPY --chown=myuser . ./
 
-
 # Run the image. If you know you won't need headful browsers,
 # you can remove the XVFB start script for a micro perf gain.
-CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent
+CMD ./start_xvfb_and_run_cmd.sh && npm run start:prod --silent

+ 0 - 21
LICENSE

@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2023 BuilderIO
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.

+ 15 - 0
License

@@ -0,0 +1,15 @@
+ISC License
+
+Copyright (c) 2023 BuilderIO
+
+Permission to use, copy, modify, and/or distribute this software for any purpose
+with or without fee is hereby granted, provided that the above copyright notice
+and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
+THIS SOFTWARE.

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 7 - 1
README.md


تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 8425 - 1893
package-lock.json


+ 6 - 2
package.json

@@ -1,6 +1,6 @@
 {
   "name": "@builder.io/gpt-crawler",
-  "version": "0.0.1",
+  "version": "1.0.0",
   "type": "module",
   "bin": {
     "gpt-crawler": "./dist/src/cli.js"
@@ -19,17 +19,21 @@
   },
   "devDependencies": {
     "@apify/tsconfig": "^0.1.0",
+    "@semantic-release/changelog": "^6.0.3",
+    "@semantic-release/git": "^10.0.1",
     "@types/inquirer": "^9.0.7",
     "@types/node": "^20.0.0",
+    "semantic-release": "^22.0.8",
     "ts-node": "^10.8.0",
     "typescript": "^5.0.0"
   },
   "scripts": {
+    "semantic-release": "semantic-release",
     "preinstall": "npx playwright install",
     "start": "npm run start:dev",
     "start:cli": "cross-env NODE_ENV=development npm run build && node dist/src/cli.js",
     "start:dev": "cross-env NODE_ENV=development npm run build && node dist/src/main.js",
-    "start:prod": "node dist/main.js",
+    "start:prod": "node dist/src/main.js",
     "build": "tsc",
     "fmt": "prettier --write ."
   },

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 8 - 2
src/config.ts


+ 29 - 4
src/core.ts

@@ -1,5 +1,5 @@
 // For more information, see https://crawlee.dev/
-import { PlaywrightCrawler } from "crawlee";
+import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
 import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import {Config, configSchema} from "./config.js";
@@ -48,7 +48,7 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
   );
 }
 
-export async function crawl(config: Config) {
+export async function crawl(config: Config) { 
   configSchema.parse(config);
 
   if (process.env.NO_CRAWL !== "true") {
@@ -108,10 +108,35 @@ export async function crawl(config: Config) {
       maxRequestsPerCrawl: config.maxPagesToCrawl,
       // Uncomment this option to see the browser window.
       // headless: false,
+      preNavigationHooks: [
+        // Abort requests for certain resource types
+        async ({ page, log }) => {
+          // If there are no resource exclusions, return
+          const RESOURCE_EXCLUSTIONS = config.resourceExclusions ?? [];
+          if (RESOURCE_EXCLUSTIONS.length === 0) {
+            return;
+          }
+          await page.route(`**\/*.{${RESOURCE_EXCLUSTIONS.join()}}`, route => route.abort('aborted'));
+          log.info(`Aborting requests for as this is a resource excluded route`);
+        }
+      ],
     });
 
-    // Add first URL to the queue and start the crawl.
-    await crawler.run([config.url]);
+    const SITEMAP_SUFFIX = "sitemap.xml";
+    const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);
+  
+    if (isUrlASitemap) {
+      const listOfUrls = await downloadListOfUrls({ url: config.url });
+  
+      // Add every URL downloaded from the sitemap to the crawling queue.
+      await crawler.addRequests(listOfUrls);
+  
+      // Run the crawler
+      await crawler.run();
+    } else {
+      // Add first URL to the queue and start the crawl.
+      await crawler.run([config.url]);
+    }
   }
 }