hunterleung.
09/08/2024, 6:53 AM【Create Crawl Task】- When clicked, it jumps to the "Add Task" page to set up the crawler parameters and rules (target URL, crawl depth, etc.)
【Stop】- This button is displayed when the task is in the running state. When clicked, the current task will be stopped. And the task status changes to "stopped". In addition to manual stopping, this state is also required when the crawler terminates abnormally.
【Start】- This button is displayed when the task is in the stopped state. When clicked, the task configuration is reloaded and the task is started. The task status changes to running.
【View Configuration】- This button is visible in all states. When clicked, it jumps to the "Modify Tasks" page. Allows the user to modify configuration items at any time. If the task is currently running, it needs to be stopped and restarted to apply the new configuration.
What is the problem in my APP ?
1. When I stop the first task and start the second task, the second task will use the parameters of the first task.
2. If a task is running, it cannot be restarted when I click the "Stop" button and then click the "Start" button again.
I don't know how to fix it, so I am posting the source code here. Could someone help me? Thanks very much.
https://cdn.discordapp.com/attachments/1282232190633246731/1282232190805086240/image.png?ex=66de9b49&is=66dd49c9&hm=7b2f699b9e5218154e98cbc18b62df326a82ec684a87837365f4ce8c0fe3d5d4&Hall
09/08/2024, 6:53 AMhunterleung.
09/08/2024, 7:01 AMjavascript
import { ipcMain } from 'electron'
import { CrawlerTask, TaskStatus } from '../../types'
import { createCrawler } from './crawlerFactory'
import { PlaywrightCrawler } from 'crawlee'
import { runTaskCrawler } from './crawlerRunner'
// In-memory registry of live crawler instances, keyed by task id.
// Each entry holds the task's config snapshot, a running counter of
// pages crawled so far, and the PlaywrightCrawler driving the task.
const taskCrawler: Record<
number,
{
taskInfo: CrawlerTask
crawlPagesTotal: number
crawler: PlaywrightCrawler
}
> = {}
/**
 * Looks up the live crawler entry registered for a task id.
 * Returns undefined when no crawler is registered for that id,
 * so callers must handle a missing entry themselves.
 */
export function getTaskCrawler(taskId: number) {
  const entry = taskCrawler[taskId]
  return entry
}
/**
 * Registers the main-process IPC handlers for starting, pausing and
 * resuming crawler tasks.
 *
 * Fixes over the original:
 *  - `runTaskCrawler` rejections are now caught: an abnormal crawl
 *    termination previously became an unhandled promise rejection and the
 *    task stayed stuck in RUNING forever.
 *  - pause/resume guard against an unknown/finished taskId: the original
 *    used `?.` on the first statement but then dereferenced
 *    `taskCrawler[taskId].taskInfo` unguarded, throwing a TypeError.
 *  - the registry entry is dropped once a run ends, so a later start
 *    always builds a fresh crawler instead of leaving a stale one behind.
 */
export function registeIpc() {
  ipcMain.on('startCrawler', async (_event, task: CrawlerTask) => {
    console.log(`\r\n start crawling,task name is 【${task.taskName}】`)
    const crawler = createCrawler(task)
    taskCrawler[task.taskId] = {
      crawler,
      taskInfo: task,
      crawlPagesTotal: 0
    }
    task.status = TaskStatus.RUNING
    try {
      await runTaskCrawler(crawler, task)
      task.status = TaskStatus.COMPLETED
      console.log(`task【${task.taskName}】done`)
    } catch (err) {
      // Abnormal termination: surface the error instead of leaving an
      // unhandled rejection with the status stuck at RUNING.
      // TODO(review): set the dedicated "stopped/failed" TaskStatus here
      // once such a member exists — the UI spec says abnormal termination
      // should land in the "stopped" state.
      console.error(`task【${task.taskName}】failed`, err)
    } finally {
      // Remove the finished crawler so a subsequent start creates a new
      // instance rather than touching this exhausted one.
      delete taskCrawler[task.taskId]
    }
  })
  ipcMain.on('pauseCrawler', (_event, taskId: number) => {
    const entry = taskCrawler[taskId]
    if (!entry) {
      // Guard: pausing an unknown or already-finished task must not throw.
      console.warn(`task[${taskId}] has no active crawler, pause ignored`)
      return
    }
    entry.crawler.autoscaledPool?.pause()
    entry.taskInfo.status = TaskStatus.PAUSED
    console.log(`task[${taskId}] crawler paused`)
  })
  ipcMain.on('resumeCrawler', (_event, taskId: number) => {
    const entry = taskCrawler[taskId]
    if (!entry) {
      // Guard: resuming an unknown or already-finished task must not throw.
      console.warn(`task[${taskId}] has no active crawler, resume ignored`)
      return
    }
    entry.crawler.autoscaledPool?.resume()
    entry.taskInfo.status = TaskStatus.RUNING
    console.log(`task[${taskId}] crawler resumed`)
  })
}
hunterleung.
09/08/2024, 7:10 AMjavascript
import { createPlaywrightRouter } from 'crawlee'
import { CrawlerTask, SkipOperator } from '../../types'
import { getTaskCrawler } from '.'
/**
 * Builds the crawlee router used by every website crawler.
 *
 * The default handler collects data from the current page, then enqueues
 * follow-up links subject to the task's limits and skip rules.
 *
 * Fixes over the original:
 *  - `limitCrawlDepth` and `limitCrawlPagesTotal` were computed but never
 *    checked, so the crawl ignored both caps entirely.
 *  - the skipType===2 regex was constructed and then never applied (dead
 *    code); it is now applied with the same include/exclude semantics as
 *    the keyword rule, and compiled once per page instead of per link.
 */
export function routerFactory() {
  const router = createPlaywrightRouter()
  router.addDefaultHandler(async ctx => {
    const userData = ctx.request.userData
    console.log(userData)
    const task = userData.task as CrawlerTask
    // Depth of the links about to be enqueued (parent depth + 1).
    const depth = userData.depth + 1
    const limitCrawlDepth = task.limitCrawlDepth
    const crawlPagesTotal = ++getTaskCrawler(task.taskId).crawlPagesTotal
    const limitCrawlPagesTotal = task.limitCrawlPagesTotal as number
    await collectData(ctx)
    // BUGFIX: enforce the configured caps. The current page is still
    // collected; only further link discovery stops.
    // NOTE(review): treats both limits as inclusive caps and 0/undefined as
    // "unlimited" — confirm the intended semantics with the task schema.
    if (limitCrawlPagesTotal && crawlPagesTotal >= limitCrawlPagesTotal) return
    if (limitCrawlDepth && depth > limitCrawlDepth) return
    // Hoisted out of transformRequestFunction so the regex is compiled once
    // per page rather than once per discovered link.
    const skipRegex =
      task.skipType === 2 && task.skipUrlRegex ? new RegExp(task.skipUrlRegex) : undefined
    await ctx.enqueueLinks({
      strategy: 'all',
      userData: {
        task,
        depth
      },
      transformRequestFunction(req) {
        const url = req.url
        const skipOperator = task.skipOperator
        if (task.skipType === 1) {
          // Keyword rule: '|'-separated keywords, matched by substring.
          const skipKeywords = task.skipKeywords
            .split('|')
            .map(k => k.trim())
            .filter(Boolean)
          if (skipOperator === SkipOperator.INCLUDE) {
            if (skipKeywords.some(k => url.includes(k))) return false
          } else {
            if (skipKeywords.every(k => !url.includes(k))) return false
          }
        } else if (skipRegex) {
          // BUGFIX: the regex rule was previously built but never applied.
          // Mirrors the keyword rule's include/exclude semantics.
          if (skipOperator === SkipOperator.INCLUDE) {
            if (skipRegex.test(url)) return false
          } else {
            if (!skipRegex.test(url)) return false
          }
        }
        return req
      }
    })
  })
  return router
}
/**
 * Logs the current page URL, then extracts the page title plus the unique
 * set of hostnames referenced by the page's anchor tags. Pages that have
 * no title or no usable links are skipped silently.
 */
async function collectData({ request, page, log }) {
  log.info('current URL:' + request.url)
  const pageTitle = await page.title()
  const rawHrefs = await page.$$eval('a', anchors => anchors.map(anchor => anchor.href))
  // Drop empty hrefs, reduce each link to its hostname, and de-duplicate.
  const hostnames = [...new Set(rawHrefs.filter(Boolean).map(href => new URL(href).hostname))]
  if (!pageTitle || hostnames.length === 0) return
  log.info('current page crawl success', {
    url: request.url,
    did: request.userData.did,
    title: pageTitle,
    links: hostnames
  })
}
ApifyBot
09/08/2024, 7:10 AMhunterleung.
09/08/2024, 7:13 AMjavascript
import { PlaywrightCrawler } from 'crawlee'
import { CrawlerTask, CrawlerType } from '../../types'
/**
 * Dispatches a task to the runner matching its crawler type.
 * Only WEBSITE tasks are supported right now; any other type is rejected.
 */
export async function runTaskCrawler(crawler: PlaywrightCrawler, task: CrawlerTask) {
  if (task.taskType === CrawlerType.WEBSITE) {
    return runWebsiteTaskCrawler(crawler, task)
  }
  throw new Error('Invalid crawler type')
}
// Kicks off a whole-site crawl: seeds the queue with the task's source URL
// at depth 0, carrying the task config along in the request's userData.
async function runWebsiteTaskCrawler(crawler: PlaywrightCrawler, task: CrawlerTask) {
  console.log(task.sourceUrl)
  const seedRequest = {
    url: task.sourceUrl,
    userData: { task, depth: 0 }
  }
  await crawler.run([seedRequest])
}
// TODO: SERP crawling is not implemented yet. This stub only logs its
// arguments (which also keeps unused-parameter lint quiet).
async function runSerpsTaskCrawler(crawler: PlaywrightCrawler, task: CrawlerTask) {
console.log(crawler, task)
}
// TODO: link-list crawling is not implemented yet. This stub only logs its
// arguments (which also keeps unused-parameter lint quiet).
async function runLinksTaskCrawler(crawler: PlaywrightCrawler, task: CrawlerTask) {
console.log(crawler, task)
}
hunterleung.
09/08/2024, 7:17 AMjavascript
import { routerFactory } from './routerFactory'
import { CrawlerTask, CrawlerType } from '../../types'
import { Configuration, PlaywrightCrawler, ProxyConfiguration } from 'crawlee'
/**
 * Factory entry point: builds a crawler instance appropriate for the
 * task's type. Unsupported types are rejected outright.
 */
export function createCrawler(task: CrawlerTask) {
  if (task.taskType === CrawlerType.WEBSITE) {
    return createWebsiteCrawler(task)
  }
  throw new Error('Invalid crawler type')
}
function createWebsiteCrawler(task: CrawlerTask) {
let proxyConfiguration
if (task.proxyType === 1 && task.proxyRule) {
proxyConfiguration = new ProxyConfiguration({
proxyUrls: [task.proxyRule]
})
}
const crawler = new PlaywrightCrawler({
headless: true,
maxRequestRetries: 2,
sessionPoolOptions: {
maxPoolSize: 1000,
blockedStatusCodes: [429]
},
proxyConfiguration,
requestHandler: routerFactory(),
maxConcurrency: task.maxWorkerThreads
})
return crawler
}
hunterleung.
09/09/2024, 11:32 AM