12345678910111213141516171819202122232425262728293031 |
- import { Page } from "playwright";
/** Argument object passed to the optional `onVisitPage` hook. */
type VisitPageOptions = {
  /** The Playwright page handed to the hook. */
  page: Page;
  /** Async callback the hook can call with a piece of data. */
  pushData: (data: any) => Promise<void>;
};

/** Shape of the crawl settings object exported from this module. */
type Config = {
  /** Where the crawl begins. */
  url: string;
  /** Links found on a page are crawled next only if they match this pattern. */
  match: string;
  /** Selector of the element whose inner text is grabbed. */
  selector: string;
  /** Upper bound on the number of pages to crawl. */
  maxPagesToCrawl: number;
  /** Name of the file the finished data is written to. */
  outputFileName: string;
  /** Cookie to set, if any (for example, to satisfy a cookie-consent prompt). */
  cookie?: { name: string; value: string };
  /** Hook to run for each page found, if provided. */
  onVisitPage?: (options: VisitPageOptions) => Promise<void>;
  /** How long to wait for a selector to appear, if provided (unit not stated here). */
  waitForSelectorTimeout?: number;
};
/**
 * Crawl settings exported by this module: start at the builder.io developer
 * docs page, follow links under `/c/docs/`, read text from the docs
 * container element, stop after 50 pages, and name the output `output.json`.
 */
export const config: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  selector: ".docs-builder-container",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};
|