main.ts

// For more information, see https://crawlee.dev/
import { PlaywrightCrawler } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { config } from "../config.js";
import { Page } from "playwright";
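
// Returns the innerText of the element matching config.selector on the
// given page, or an empty string if no matching element is found.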
export function getPageHtml(page: Page) {
  return page.evaluate((selector) => {
    const el = document.querySelector(selector) as HTMLElement | null;
    return el?.innerText || "";
  }, config.selector);
}
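
// Setting NO_CRAWL=true skips the crawl entirely and only rebuilds the
// output file from results already stored under ./storage/datasets/default.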
if (process.env.NO_CRAWL !== "true") {
  // PlaywrightCrawler crawls the web using a headless
  // browser controlled by the Playwright library.
  const crawler = new PlaywrightCrawler({
    // Use the requestHandler to process each of the crawled pages.
    async requestHandler({ request, page, enqueueLinks, log, pushData }) {
      if (config.cookie) {
        // Set the cookie for the specific URL
        const cookie = {
          name: config.cookie.name,
          value: config.cookie.value,
          url: request.loadedUrl,
        };
        await page.context().addCookies([cookie]);
      }

      const title = await page.title();
      log.info(`Crawling ${request.loadedUrl}...`);

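      // Wait for the configured content selector to appear (up to 1 second)
      // before extracting its text.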
      await page.waitForSelector(config.selector, {
        timeout: 1000,
      });

      const html = await getPageHtml(page);

      // Save results as JSON to ./storage/datasets/default
      await pushData({ title, url: request.loadedUrl, html });

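      // Run the optional per-page hook from the config, if one is provided.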
      if (config.onVisitPage) {
        await config.onVisitPage({ page, pushData });
      }

      // Extract links from the current page
      // and add them to the crawling queue.
      await enqueueLinks({
        globs: [config.match],
      });
    },
    // Comment out this option to scrape the full website.
    maxRequestsPerCrawl: config.maxPagesToCrawl,
    // Uncomment this option to see the browser window.
    // headless: false,
  });

  // Add the first URL to the queue and start the crawl.
  await crawler.run([config.url]);
}
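
// Combine every per-page JSON result the crawler stored into a single
// output file at config.outputFileName.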
const jsonFiles = await glob("storage/datasets/default/*.json", {
  absolute: true,
});

const results = [];
for (const file of jsonFiles) {
  const data = JSON.parse(await readFile(file, "utf-8"));
  results.push(data);
}

await writeFile(config.outputFileName, JSON.stringify(results, null, 2));