osama
10/02/2024, 3:48 PM
import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee';
import proxy from './proxy_config.js';
// PlaywrightCrawler crawls the web using a headless browser controlled by the Playwright library.

// Build the proxy URL through the URL API rather than raw string interpolation:
// the username/password setters percent-encode reserved characters (such as
// '@', ':' or '/'), so credentials containing them cannot corrupt the URL.
const proxyUrl = new URL(`http://${proxy.host}:${proxy.port}`);
proxyUrl.username = proxy.username;
proxyUrl.password = proxy.password;

const proxyConfiguration = new ProxyConfiguration({
  // URL serialization appends a trailing '/'; strip it so the string keeps the
  // `http://user:pass@host:port` shape the script used before.
  proxyUrls: [proxyUrl.href.replace(/\/$/, '')],
});

const crawler = new PlaywrightCrawler({
  proxyConfiguration,

  // Use the requestHandler to process each of the crawled pages.
  async requestHandler({ request, page, enqueueLinks, pushData, log }) {
    const title = await page.title();
    log.info(`Title of ${request.loadedUrl} is '${title}'`);

    // Save results as JSON to `./storage/datasets/default` directory.
    await pushData({ title, url: request.loadedUrl });

    // Extract links from the current page and add them to the crawling queue.
    await enqueueLinks();
  },

  // Uncomment this option to see the browser window.
  // headless: false,

  // Comment this option to scrape the full website.
  maxRequestsPerCrawl: 20,
});

// Add first URL to the queue and start the crawl.
await crawler.run(['https://nopecha.com/demo/cloudflare']);

// Export the whole dataset to a single file in `./result.csv`.
await crawler.exportData('./result.csv');

// Or work with the data directly.
const data = await crawler.getData();
console.table(data.items);
https://cdn.discordapp.com/attachments/1291064360159936634/1291064360407531591/image.png?ex=66febce2&is=66fd6b62&hm=57c695d29e1a4c9d5b014b4d1df41cc85e505991d44daae8903a3f0511b3c68e&
Hall
10/02/2024, 3:48 PM
osama
10/02/2024, 3:54 PM
Hamza
10/03/2024, 12:18 AM
osama
10/04/2024, 3:53 PM