core.ts

// For more information, see https://crawlee.dev/
import { PlaywrightCrawler } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config } from "../config.js";
import { Page } from "playwright";

// Counts pages handled so far, used for progress logging.
let pageCounter = 0;
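
// Extracts text from the first element matching `selector`: selectors that
// start with "/" are evaluated as XPath expressions inside the page, anything
// else is treated as a CSS selector.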
export function getPageHtml(page: Page, selector: string) {
  return page.evaluate((selector) => {
    // Check if the selector is an XPath
    if (selector.startsWith("/")) {
      const elements = document.evaluate(
        selector,
        document,
        null,
        XPathResult.ANY_TYPE,
        null,
      );
      let result = elements.iterateNext();
      return result ? result.textContent || "" : "";
    } else {
      // Handle as a CSS selector
      const el = document.querySelector(selector) as HTMLElement | null;
      return el?.innerText || "";
    }
  }, selector);
}
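
// Polls inside the page until the given XPath matches at least one node,
// throwing a timeout error if nothing appears within `timeout` milliseconds.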
export async function waitForXPath(page: Page, xpath: string, timeout: number) {
  await page.waitForFunction(
    (xpath) => {
      const elements = document.evaluate(
        xpath,
        document,
        null,
        XPathResult.ANY_TYPE,
        null,
      );
      return elements.iterateNext() !== null;
    },
    xpath,
    { timeout },
  );
}
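
// Crawls the site described by `config` with a headless Playwright browser and
// stores one { title, url, html } record per page in
// ./storage/datasets/default. Setting the NO_CRAWL environment variable to
// "true" skips crawling entirely.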
export async function crawl(config: Config) {
  if (process.env.NO_CRAWL !== "true") {
    // PlaywrightCrawler crawls the web using a headless
    // browser controlled by the Playwright library.
    const crawler = new PlaywrightCrawler({
      // Use the requestHandler to process each of the crawled pages.
      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
        if (config.cookie) {
          // Set the cookie for the specific URL
          const cookie = {
            name: config.cookie.name,
            value: config.cookie.value,
            url: request.loadedUrl,
          };
          await page.context().addCookies([cookie]);
        }

        const title = await page.title();
        pageCounter++;
        log.info(
          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
        );

        // Use custom handling for XPath selector
        if (config.selector.startsWith("/")) {
          await waitForXPath(
            page,
            config.selector,
            config.waitForSelectorTimeout ?? 1000,
          );
        } else {
          await page.waitForSelector(config.selector, {
            timeout: config.waitForSelectorTimeout ?? 1000,
          });
        }

        const html = await getPageHtml(page, config.selector);

        // Save results as JSON to ./storage/datasets/default
        await pushData({ title, url: request.loadedUrl, html });

        if (config.onVisitPage) {
          await config.onVisitPage({ page, pushData });
        }

        // Extract links from the current page
        // and add them to the crawling queue.
        await enqueueLinks({
          globs:
            typeof config.match === "string" ? [config.match] : config.match,
        });
      },
      // Comment this option to scrape the full website.
      maxRequestsPerCrawl: config.maxPagesToCrawl,
      // Uncomment this option to see the browser window.
      // headless: false,
    });

    // Add first URL to the queue and start the crawl.
    await crawler.run([config.url]);
  }
}
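
// Merges the JSON records produced by crawl() into a single array and writes
// it to config.outputFileName.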
export async function write(config: Config) {
  const jsonFiles = await glob("storage/datasets/default/*.json", {
    absolute: true,
  });

  const results = [];
  for (const file of jsonFiles) {
    const data = JSON.parse(await readFile(file, "utf-8"));
    results.push(data);
  }

  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
}
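
// Usage sketch (not part of this file): assuming a Config with the fields
// referenced above (url, match, selector, maxPagesToCrawl, outputFileName),
// a caller could run the two steps from an async entry point. The glob
// pattern and numbers below are placeholder values, not defaults taken from
// ../config.js.
//
//   const config: Config = {
//     url: "https://crawlee.dev/",
//     match: "https://crawlee.dev/**",
//     selector: "article",
//     maxPagesToCrawl: 50,
//     outputFileName: "output.json",
//   };
//   await crawl(config); // populates ./storage/datasets/default
//   await write(config); // merges the dataset into output.json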