// For more information, see https://crawlee.dev/
import { PlaywrightCrawler } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config } from "../config.js";
import { Page } from "playwright";

let pageCounter = 0;
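
/**
 * Returns the text content of the first element matching `selector`.
 * Selectors beginning with "/" are evaluated as XPath expressions and the
 * matched node's textContent is returned; anything else is treated as a
 * CSS selector and the element's innerText is returned.
 */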
export function getPageHtml(page: Page, selector: string) {
  return page.evaluate((selector) => {
    // Check if the selector is an XPath
    if (selector.startsWith("/")) {
      const elements = document.evaluate(
        selector,
        document,
        null,
        XPathResult.ANY_TYPE,
        null,
      );
      const result = elements.iterateNext();
      return result ? result.textContent || "" : "";
    } else {
      // Handle as a CSS selector
      const el = document.querySelector(selector) as HTMLElement | null;
      return el?.innerText || "";
    }
  }, selector);
}
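
/**
 * Resolves once at least one node matching the XPath expression exists in
 * the document; rejects with a timeout error after `timeout` milliseconds.
 */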
export async function waitForXPath(page: Page, xpath: string, timeout: number) {
  await page.waitForFunction(
    (xpath) => {
      const elements = document.evaluate(
        xpath,
        document,
        null,
        XPathResult.ANY_TYPE,
        null,
      );
      return elements.iterateNext() !== null;
    },
    xpath,
    { timeout },
  );
}
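
/**
 * Crawls config.url with a headless Playwright browser, following links
 * that match config.match and pushing one { title, url, html } record per
 * page into Crawlee's default dataset. Setting NO_CRAWL=true skips the
 * crawl entirely.
 */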
export async function crawl(config: Config) {
  if (process.env.NO_CRAWL !== "true") {
    // PlaywrightCrawler crawls the web using a headless
    // browser controlled by the Playwright library.
    const crawler = new PlaywrightCrawler({
      // Use the requestHandler to process each of the crawled pages.
      async requestHandler({ request, page, enqueueLinks, log, pushData }) {
        if (config.cookie) {
          // Set the cookie for the specific URL
          const cookie = {
            name: config.cookie.name,
            value: config.cookie.value,
            url: request.loadedUrl,
          };
          await page.context().addCookies([cookie]);
        }

        const title = await page.title();
        pageCounter++;
        log.info(
          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
        );

        // Use custom handling for XPath selectors; CSS selectors can use
        // Playwright's built-in waitForSelector.
        if (config.selector.startsWith("/")) {
          await waitForXPath(
            page,
            config.selector,
            config.waitForSelectorTimeout ?? 1000,
          );
        } else {
          await page.waitForSelector(config.selector, {
            timeout: config.waitForSelectorTimeout ?? 1000,
          });
        }

        const html = await getPageHtml(page, config.selector);

        // Save results as JSON to ./storage/datasets/default
        await pushData({ title, url: request.loadedUrl, html });

        if (config.onVisitPage) {
          await config.onVisitPage({ page, pushData });
        }

        // Extract links from the current page
        // and add them to the crawling queue.
        await enqueueLinks({
          globs:
            typeof config.match === "string" ? [config.match] : config.match,
        });
      },
      // Comment out this option to scrape the full website.
      maxRequestsPerCrawl: config.maxPagesToCrawl,
      // Uncomment this option to see the browser window.
      // headless: false,
    });

    // Add the first URL to the queue and start the crawl.
    await crawler.run([config.url]);
  }
}
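
/**
 * Merges the per-page JSON records from storage/datasets/default into a
 * single JSON array written to config.outputFileName.
 */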
export async function write(config: Config) {
  const jsonFiles = await glob("storage/datasets/default/*.json", {
    absolute: true,
  });

  const results = [];
  for (const file of jsonFiles) {
    const data = JSON.parse(await readFile(file, "utf-8"));
    results.push(data);
  }

  await writeFile(config.outputFileName, JSON.stringify(results, null, 2));
}