// core.ts

// For more information, see https://crawlee.dev/
import { PlaywrightCrawler, downloadListOfUrls } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config, configSchema } from "./config.js";
import { Page } from "playwright";

let pageCounter = 0;
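
// Extracts the text of the first element matching `selector` on the page.
// Selectors starting with "/" are evaluated as XPath expressions; anything
// else is treated as a CSS selector.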
export function getPageHtml(page: Page, selector = "body") {
  return page.evaluate((selector) => {
    // Check if the selector is an XPath
    if (selector.startsWith("/")) {
      const elements = document.evaluate(
        selector,
        document,
        null,
        XPathResult.ANY_TYPE,
        null
      );
      let result = elements.iterateNext();
      return result ? result.textContent || "" : "";
    } else {
      // Handle as a CSS selector
      const el = document.querySelector(selector) as HTMLElement | null;
      return el?.innerText || "";
    }
  }, selector);
}
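
// Waits until at least one element matching the given XPath expression exists
// in the DOM, rejecting once `timeout` milliseconds have elapsed.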
export async function waitForXPath(page: Page, xpath: string, timeout: number) {
  await page.waitForFunction(
    (xpath) => {
      const elements = document.evaluate(
        xpath,
        document,
        null,
        XPathResult.ANY_TYPE,
        null
      );
      return elements.iterateNext() !== null;
    },
    xpath,
    { timeout }
  );
}
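
// Crawls the site described by `config` and saves one record per page
// ({ title, url, html }) to ./storage/datasets/default. Setting the NO_CRAWL
// environment variable to "true" skips crawling entirely.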
export async function crawl(config: Config) {
  configSchema.parse(config);

  if (process.env.NO_CRAWL !== "true") {
    // PlaywrightCrawler crawls the web using a headless
    // browser controlled by the Playwright library.
    const crawler = new PlaywrightCrawler({
      // Use the requestHandler to process each of the crawled pages.
      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
        if (config.cookie) {
          // Set the cookie for the specific URL
          const cookie = {
            name: config.cookie.name,
            value: config.cookie.value,
            url: request.loadedUrl,
          };
          await page.context().addCookies([cookie]);
        }

        const title = await page.title();
        pageCounter++;
        log.info(
          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`
        );

        // Use custom handling for XPath selector
        if (config.selector) {
          if (config.selector.startsWith("/")) {
            await waitForXPath(
              page,
              config.selector,
              config.waitForSelectorTimeout ?? 1000
            );
          } else {
            await page.waitForSelector(config.selector, {
              timeout: config.waitForSelectorTimeout ?? 1000,
            });
          }
        }

        const html = await getPageHtml(page, config.selector);

        // Save results as JSON to ./storage/datasets/default
        await pushData({ title, url: request.loadedUrl, html });

        if (config.onVisitPage) {
          await config.onVisitPage({ page, pushData });
        }

        // Extract links from the current page
        // and add them to the crawling queue.
        await enqueueLinks({
          globs:
            typeof config.match === "string" ? [config.match] : config.match,
        });
      },
      // Comment this option out to scrape the full website.
      maxRequestsPerCrawl: config.maxPagesToCrawl,
      // Uncomment this option to see the browser window.
      // headless: false,
      preNavigationHooks: [
        // Abort requests for certain resource types
        async ({ page, log }) => {
          // If there are no resource exclusions, return
          const RESOURCE_EXCLUSIONS = config.resourceExclusions ?? [];
          if (RESOURCE_EXCLUSIONS.length === 0) {
            return;
          }
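          // Build a route glob from the excluded extensions, e.g.
          // ["png", "svg"] becomes "**/*.{png,svg}", and abort matching requests.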
          await page.route(`**/*.{${RESOURCE_EXCLUSIONS.join()}}`, (route) =>
            route.abort("aborted")
          );
          log.info(
            `Aborting requests for excluded resource types: ${RESOURCE_EXCLUSIONS.join(", ")}`
          );
        },
      ],
    });

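    // If the starting URL is a sitemap, seed the request queue with every URL
    // it lists; otherwise start from the single configured URL.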
    const SITEMAP_SUFFIX = "sitemap.xml";
    const isUrlASitemap = config.url.endsWith(SITEMAP_SUFFIX);

    if (isUrlASitemap) {
      const listOfUrls = await downloadListOfUrls({ url: config.url });

      // Add the URLs from the sitemap to the crawling queue.
      await crawler.addRequests(listOfUrls);

      // Run the crawler
      await crawler.run();
    } else {
      // Add the first URL to the queue and start the crawl.
      await crawler.run([config.url]);
    }
  }
}
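
// Merges the per-page JSON files produced by the crawler under
// storage/datasets/default into a single file named by config.outputFileName.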
export async function write(config: Config) {
  configSchema.parse(config);

  const jsonFiles = await glob("storage/datasets/default/*.json", {
    absolute: true,
  });

  const results = [];
  for (const file of jsonFiles) {
    const data = JSON.parse(await readFile(file, "utf-8"));
    results.push(data);
  }

  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
}
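
// Example usage (a minimal sketch; the exact Config shape is defined in
// ./config.js, and the values below are placeholders, not defaults):
//
//   const config: Config = {
//     url: "https://example.com/docs/",        // start page or a sitemap.xml URL
//     match: "https://example.com/docs/**",    // glob(s) for links to enqueue
//     selector: ".main-content",               // CSS selector (or "/..." XPath)
//     maxPagesToCrawl: 50,
//     outputFileName: "output.json",
//   };
//   await crawl(config);
//   await write(config);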