fierDeToiMonGrand
07/17/2025, 8:11 PM
```typescript
import { RequestQueue } from "crawlee";
// Assumption: a Playwright page; swap for the "puppeteer" Page type if that is the crawler in use.
import type { Page } from "playwright";

// Defined elsewhere in the actor; declared here so the excerpt stands alone.
declare const saveOnlyUniqueItems: boolean;
declare function myLog(message: string): void;

let jobQueue: RequestQueue;

// Open the named queue once and reuse it across calls.
async function initializeJobQueue() {
  if (!jobQueue) {
    jobQueue = await RequestQueue.open("job-deduplication-queue");
  }
}

async function fetchJobPages(page: Page, jobIds: string[], origin: string) {
  await initializeJobQueue();
  const filteredJobIds: string[] = [];
  if (saveOnlyUniqueItems) {
    for (const jobId of jobIds) {
      const jobUrl = `${origin}/viewjob?jk=${jobId}`;
      // addRequest() reports whether the URL was already in the queue.
      const request = await jobQueue.addRequest({ url: jobUrl });
      if (!request.wasAlreadyPresent) filteredJobIds.push(jobId);
    }
  } else {
    filteredJobIds.push(...jobIds);
  }
  myLog(
    `Filtered ${jobIds.length - filteredJobIds.length} duplicates, ` +
      `processing ${filteredJobIds.length} unique jobs.`
  );
  // fetchJobWithRetry and batching logic follows...
}
```
Am I using the request queue correctly? I am not using the default one from the crawler because my scraping logic does not allow it.
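As far as the Crawlee docs go, this usage looks right: RequestQueue.addRequest() derives a uniqueKey from the URL, and the wasAlreadyPresent flag on its result tells you whether the queue had already seen it. One possible refinement, sketched below, is to batch a whole page of IDs into a single addRequests() call and set uniqueKey to the job ID itself, so deduplication does not depend on URL normalization. The helper name dedupeJobIds is illustrative, not from the original post.
```typescript
import { RequestQueue } from "crawlee";

// Hypothetical helper: same dedup idea as above, but batched in one call.
async function dedupeJobIds(jobIds: string[], origin: string): Promise<string[]> {
  const queue = await RequestQueue.open("job-deduplication-queue");
  const { processedRequests } = await queue.addRequests(
    jobIds.map((jobId) => ({
      url: `${origin}/viewjob?jk=${jobId}`,
      uniqueKey: jobId, // dedupe on the job ID, not the normalized URL
    }))
  );
  // Collect the IDs the queue had already seen on a previous run.
  const duplicates = new Set(
    processedRequests.filter((r) => r.wasAlreadyPresent).map((r) => r.uniqueKey)
  );
  return jobIds.filter((jobId) => !duplicates.has(jobId));
}
```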
𝓓𝙤𝙘𝙠𝙚𝙧𝙨
06/07/2025, 5:34 PM
```js
import { PuppeteerCrawler, Dataset } from "crawlee";
import { router } from "./routes.js";

const crawler = new PuppeteerCrawler({
  launchContext: {
    useChrome: true,
    userDataDir: 'C:\Users\enric\AppData\Local\Google\Chrome\User Data\Default',
    launchOptions: {
      headless: false,
    },
  },
  requestHandler: router,
  async failedRequestHandler({ request }) {
    // This function is called when the crawling of a request failed too many times
    await Dataset.pushData({
      url: request.url,
      succeeded: false,
      errors: request.errorMessages,
    });
  },
});

await crawler.run([
  'mylink',
]);
```
Still the crawler opens without the Chrome Profile 🙂
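A likely culprit, though nothing in the thread confirms it: in a normal JavaScript string literal, the single backslashes in the userDataDir path are parsed as escape sequences and silently dropped, so Chrome receives a mangled path. Chromium also expects userDataDir to point at the "User Data" folder itself, with the specific profile selected via the --profile-directory switch, and that profile must not be open in another running Chrome instance. A corrected sketch under those assumptions:
```typescript
import { PuppeteerCrawler } from "crawlee";
import { router } from "./routes.js";

const crawler = new PuppeteerCrawler({
  launchContext: {
    useChrome: true,
    // Escape the backslashes (or use forward slashes) and point at the
    // "User Data" root rather than at the "Default" profile folder.
    userDataDir: "C:\\Users\\enric\\AppData\\Local\\Google\\Chrome\\User Data",
    launchOptions: {
      headless: false,
      // Select the profile inside the user data dir.
      args: ["--profile-directory=Default"],
    },
  },
  requestHandler: router,
});
```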
FoudreTower
06/07/2025, 12:14 AM
```js
router.addHandler(
  "search",
  async ({ request, page, log, pushData, enqueueLinks }) => {
    log.info(`Processing ${request.url} ...`);
    // Find all search result items
    const searchElements = await page
      .locator("div#search div#rso div[data-hveid][lang]")
      .all();
    for (let index = 0; index < 1; index++) {
      const element = searchElements[index];
      const url = await element.locator("a").first().getAttribute("href");
      await enqueueLinks({
        label: "subprocessors",
        urls: [url],
      });
    }
    console.log(await Dataset.getData());
  }
);

// Not triggered
router.addHandler("subprocessors", async ({ request, page, log, pushData }) => {
  log.info(`Processing ${request.url} ...`);
});
```
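One plausible reason the second handler never fires, not confirmed in the thread: enqueueLinks() defaults to the same-hostname strategy, so outbound links harvested from a Google results page point at other domains and are filtered out before they ever reach the queue. A minimal sketch of the "search" handler that opts out of that filter; only the strategy option and the null guard are new:
```typescript
router.addHandler("search", async ({ request, page, log, enqueueLinks }) => {
  log.info(`Processing ${request.url} ...`);
  const url = await page
    .locator("div#search div#rso div[data-hveid][lang] a")
    .first()
    .getAttribute("href");
  if (url) {
    await enqueueLinks({
      label: "subprocessors",
      urls: [url],
      strategy: "all", // the default "same-hostname" drops off-site links
    });
  }
});
```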
NeoNomade | Scraping hellhound
04/23/2025, 10:32 AM
```typescript
preNavigationHooks: [
  // gotoOptions is the hook's second argument; the first is the crawling context.
  async (_crawlingContext, gotoOptions) => {
    gotoOptions.waitUntil = "load";
  },
  async ({ page }) => {
    await page.route("**/*", async (route) => {
      const url = route.request().url();
      const resourceType = route.request().resourceType();
      const trackingScriptRegex =
        /googletagmanager|facebook|sentry|ads|tracking|metrics|analytics|optimizely|segment/i;
      const extraBlocklistRegex =
        /tiktok|facebook|prismic-images|bing|ads|tracking|metrics|analytics|contentsquare|lytics|adtrafficquality|adsrvr|tmol|snapchat|ticketm\.net/i;
      const isBlockedResourceType = ["stylesheet", "font", "media"].includes(resourceType);
      const isBlockedScript = resourceType === "script" && trackingScriptRegex.test(url);
      const isBlockedByExtraPatterns = extraBlocklistRegex.test(url);
      // Never block reCAPTCHA assets, otherwise challenges cannot render.
      const shouldBlock =
        !url.includes("recaptcha") &&
        (isBlockedResourceType || isBlockedScript || isBlockedByExtraPatterns);
      if (shouldBlock) {
        await route.abort();
        return;
      }
      await route.continue();
    });
  },
],
```
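For context, this fragment is the preNavigationHooks property of a crawler's options object, and page.route() is Playwright's request-interception API, so a PlaywrightCrawler is assumed in the sketch below; the handler body is purely illustrative.
```typescript
import { PlaywrightCrawler } from "crawlee";

const crawler = new PlaywrightCrawler({
  preNavigationHooks: [
    // ...the two hooks from the snippet above go here...
  ],
  async requestHandler({ request, page, log }) {
    // Illustrative handler: the blocked resources never reach the page.
    log.info(`Loaded ${request.url}: ${await page.title()}`);
  },
});

await crawler.run(["https://example.com"]);
```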