| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
- // For more information, see https://crawlee.dev/
- import { PlaywrightCrawler } from "crawlee";
- import { readFile, writeFile } from "fs/promises";
- import { glob } from "glob";
- import { config } from "../config.js";
- import { Page } from "playwright";
/**
 * Extracts the rendered text of the first element matching `config.selector`.
 *
 * Despite the name, this returns the element's `innerText` (visible text),
 * not its HTML markup. The callback runs inside the browser page, so it may
 * only use the selector argument that is passed in explicitly.
 *
 * @param page - Playwright page to read from.
 * @returns A promise for the element's text, or `""` when no element matches
 *          or the matched element has no text.
 */
export function getPageHtml(page: Page) {
  return page.evaluate((cssSelector) => {
    const target = document.querySelector(cssSelector) as HTMLElement | null;
    if (target === null) {
      // Nothing matched the selector: report empty content.
      return "";
    }
    return target.innerText || "";
  }, config.selector);
}
// Run the crawl unless it is explicitly disabled with NO_CRAWL=true.
if (process.env.NO_CRAWL !== "true") {
  // PlaywrightCrawler crawls the web using a headless
  // browser controlled by the Playwright library.
  const crawler = new PlaywrightCrawler({
    // Install the configured cookie BEFORE navigating. Adding it inside
    // requestHandler is too late: by then the page has already been fetched
    // without it, so the captured content would not reflect the cookie.
    preNavigationHooks: [
      async ({ page, request }) => {
        if (config.cookie) {
          await page.context().addCookies([
            {
              name: config.cookie.name,
              value: config.cookie.value,
              // loadedUrl is not known before navigation; use the request URL.
              url: request.url,
            },
          ]);
        }
      },
    ],

    // Use the requestHandler to process each of the crawled pages.
    async requestHandler({ request, page, enqueueLinks, log, pushData }) {
      const title = await page.title();
      log.info(`Crawling ${request.loadedUrl}...`);

      // Make sure the target element is present before reading it; this
      // rejects if the selector does not appear within one second.
      await page.waitForSelector(config.selector, {
        timeout: 1000,
      });

      const html = await getPageHtml(page);

      // Save results as JSON to ./storage/datasets/default
      await pushData({ title, url: request.loadedUrl, html });

      // Optional per-page hook supplied by the configuration.
      if (config.onVisitPage) {
        await config.onVisitPage({ page, pushData });
      }

      // Extract links from the current page
      // and add them to the crawling queue.
      await enqueueLinks({
        globs: [config.match],
      });
    },
    // Comment this option to scrape the full website.
    maxRequestsPerCrawl: config.maxPagesToCrawl,
    // Uncomment this option to see the browser window.
    // headless: false,
  });

  // Add first URL to the queue and start the crawl.
  await crawler.run([config.url]);
}
// Merge every dataset record found on disk into a single JSON array and
// write it to the configured output file. This runs even when the crawl
// above was skipped, so previously stored results can be re-exported.
const datasetPaths = await glob("storage/datasets/default/*.json", {
  absolute: true,
});

const records: unknown[] = [];
for (const datasetPath of datasetPaths) {
  // Files are read one at a time, in the order glob returned them.
  const rawJson = await readFile(datasetPath, "utf-8");
  records.push(JSON.parse(rawJson));
}

// Pretty-print with a two-space indent.
const serialized = JSON.stringify(records, null, 2);
await writeFile(config.outputFileName, serialized);
|