// config.ts
import { Page } from "playwright";
  2. type Config = {
  3. /** URL to start the crawl */
  4. url: string;
  5. /** Pattern to match against for links on a page to subsequently crawl */
  6. match: string;
  7. /** Selector to grab the inner text from */
  8. selector: string;
  9. /** Don't crawl more than this many pages */
  10. maxPagesToCrawl: number;
  11. /** File name for the finished data */
  12. outputFileName: string;
  13. /** Optional cookie to be set. E.g. for Cookie Consent */
  14. cookie?: {name: string; value: string}
  15. /** Optional function to run for each page found */
  16. onVisitPage?: (options: {
  17. page: Page;
  18. pushData: (data: any) => Promise<void>;
  19. }) => Promise<void>;
  20. };
  21. export const config: Config = {
  22. url: "https://www.builder.io/c/docs/developers",
  23. match: "https://www.builder.io/c/docs/**",
  24. selector: `.docs-builder-container`,
  25. maxPagesToCrawl: 1000,
  26. outputFileName: "output.json",
  27. };