Nel549
03/15/2025, 7:02 PM
```javascript
import { crawler } from "./main.js"; // Import the exported crawler from the main file
import express from "express";

const app = express();
app.use(express.json());

const BASE_URL = "https.....";

app.post("/scrape", async (req, res) => {
  if (!req.body || !req.body.usernames) {
    return res.status(400).json({ error: "Invalid input" });
  }

  const { usernames } = req.body;
  const urls = usernames.map((username) => `${BASE_URL}${username}`);

  try {
    await crawler.run(urls);
    const dataset = await crawler.getData();
    return res.status(200).json({ data: dataset });
  } catch (error) {
    console.error("Scraping error:", error);
    return res.status(500).json({ error: "Scraping failed" });
  }
});

const PORT = parseInt(process.env.PORT) || 3000;
app.listen(PORT, () => console.log(`Server running on port ${PORT}`));
```
Here is how my crawler looks:
```javascript
import { CheerioCrawler, ProxyConfiguration, Configuration, Dataset, log } from "crawlee";

const proxies = [...]; // my proxy list
const proxyConfiguration = new ProxyConfiguration({
  proxyUrls: proxies,
});

export const crawler = new CheerioCrawler({
  proxyConfiguration,
  requestHandler: async ({ request, json, proxyInfo }) => {
    log.info(JSON.stringify(proxyInfo, null, 2));
    // Scraping logic
    await Dataset.pushData({
      // pushing data
    });
  },
}, new Configuration({
  persistStorage: false,
}));
```
Hall
03/15/2025, 7:02 PM

Oleg V.
03/17/2025, 6:52 PM
```javascript
import { CheerioCrawler, ProxyConfiguration, Configuration, Dataset } from "crawlee";

// Define a function that creates a new crawler instance for each request
const createCrawler = () => {
  const proxies = [...]; // your proxy list
  const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: proxies,
  });

  return new CheerioCrawler({
    proxyConfiguration,
    // Disable the session pool so instances don't share session state
    useSessionPool: false,
    requestHandler: async ({ request, json, proxyInfo }) => {
      console.log(JSON.stringify(proxyInfo, null, 2));
      // Scraping logic
      await Dataset.pushData({
        // pushing data
      });
    },
  }, new Configuration({
    // Turn off storage persistence to avoid conflicts between runs
    persistStorage: false,
  }));
};

// Create a new crawler instance for each request
const crawler = createCrawler();
// Run the crawler with the provided URLs
await crawler.run(urls);
```
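For completeness, here is a minimal sketch of how the `/scrape` route from the first snippet could call `createCrawler()` per request instead of sharing one exported crawler. The route shape, `BASE_URL`, and the use of `crawler.getData()` are carried over from the question; treat it as an untested illustration rather than a drop-in fix:

```javascript
// Hypothetical adaptation of the /scrape route to a per-request crawler.
app.post("/scrape", async (req, res) => {
  if (!req.body || !Array.isArray(req.body.usernames)) {
    return res.status(400).json({ error: "Invalid input" });
  }

  const urls = req.body.usernames.map((username) => `${BASE_URL}${username}`);

  try {
    // Fresh crawler (and proxy configuration) for this request only
    const crawler = createCrawler();
    await crawler.run(urls);

    // Read whatever the requestHandler pushed to the default dataset
    const { items } = await crawler.getData();
    return res.status(200).json({ data: items });
  } catch (error) {
    console.error("Scraping error:", error);
    return res.status(500).json({ error: "Scraping failed" });
  }
});
```

Whether concurrent requests still end up sharing the default dataset depends on how the `Configuration` is scoped, so it is worth verifying with two parallel requests before relying on it.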