1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
- import { Page } from "playwright";
- export type Config = {
- /**
- * URL to start the crawl
- * @example "https://www.builder.io/c/docs/developers"
- * @default ""
- */
- url: string;
- /**
- * Pattern to match against for links on a page to subsequently crawl
- * @example "https://www.builder.io/c/docs/**"
- * @default ""
- */
- match: string | string[];
- /**
- * Selector to grab the inner text from
- * @example ".docs-builder-container"
- * @default ""
- */
- selector: string;
- /**
- * Don't crawl more than this many pages
- * @default 50
- */
- maxPagesToCrawl: number;
- /**
- * File name for the finished data
- * @default "output.json"
- */
- outputFileName: string;
- /** Optional cookie to be set. E.g. for Cookie Consent */
- cookie?: { name: string; value: string };
- /** Optional function to run for each page found */
- onVisitPage?: (options: {
- page: Page;
- pushData: (data: any) => Promise<void>;
- }) => Promise<void>;
- /** Optional timeout for waiting for a selector to appear */
- waitForSelectorTimeout?: number;
- };
- export const defaultConfig: Config = {
- url: "https://www.builder.io/c/docs/developers",
- match: "https://www.builder.io/c/docs/**",
- selector: `.docs-builder-container`,
- maxPagesToCrawl: 50,
- outputFileName: "../output.json",
- };
|