kevinswiber
11/28/2024, 12:33 AMContent-Type
header of responses and download JSON or YAML files? I'm using Playwright to crawl my sites and have some JSON and YAML content I would like to capture, as well.Hall
11/28/2024, 12:33 AMApifyBot
11/28/2024, 12:54 AMExp
11/28/2024, 12:54 AMkevinswiber
11/28/2024, 1:40 AMExp
11/28/2024, 2:25 AMExp
11/28/2024, 2:25 AMkevinswiber
11/28/2024, 8:13 AMazzouzana
11/28/2024, 9:49 PMpreNavigationHooks:[
/**
 * Pre-navigation hook: subscribes to the page's `response` event so that
 * matching responses can be inspected as they arrive.
 *
 * @param {object} crawlingContext - Hook context; only `crawlingContext.page` is used here.
 */
async (crawlingContext) => {
  crawlingContext.page.on('response', async (response) => {
    // Skip responses whose URL does not contain the marker.
    // NOTE(review): '......' looks like a placeholder — replace with the URL fragment of interest.
    if (!response.url().includes('......')) {
      return;
    }
    // Check the content type here and decide what to do with the response.
    const headers = response.headers();
    const body = await response.json(); // or response.text()
  });
}
]
kevinswiber
12/01/2024, 1:25 AMpreNavigationHooks: [
/**
 * Pre-navigation hook: inspects the first response the page receives and, when
 * its Content-Type is JSON or YAML, waits for the resulting download, reads the
 * downloaded file and pushes its content to the dataset.
 *
 * NOTE(review): `readFile` and `dataset` are not defined in this snippet; they
 * must already be in scope where the hook is declared.
 *
 * @param {object} context - Hook context; `request`, `page` and `log` are used.
 */
async ({ request, page, log }) => {
  page.once("response", async (response) => {
    const contentType = response.headers()["content-type"];
    const isCapturable =
      contentType?.includes("application/json") ||
      contentType?.includes("application/x-yaml") ||
      contentType?.includes("text/yaml") ||
      contentType?.includes("application/yaml");
    if (!isCapturable) {
      return;
    }

    const type = contentType.includes("json") ? "json" : "yaml";
    // Presumably tells the crawler not to treat the interrupted navigation as
    // a failure and not to retry it — confirm against the Crawlee docs.
    request.skipNavigation = true;
    request.noRetry = true;

    try {
      // Await the download directly instead of wrapping a listener in
      // `new Promise`: a failure now rejects here rather than leaving a
      // promise that never settles.
      const download = await page.waitForEvent("download");
      const path = await download.path();
      const content = await readFile(path, "utf8");
      await dataset.pushData({
        url: download.url(),
        contentType,
        filename: download.suggestedFilename(),
        type,
        content,
      });
      log.info(`Downloaded ${type}: ${download.url()}`);
    } catch (error) {
      // This runs inside an event listener, so nothing upstream can catch the
      // error; report it instead of leaving an unhandled rejection.
      log.error(`Failed to capture ${type} download: ${error.message}`);
    }
  });
}
],
azzouzana
12/01/2024, 1:36 AMkevinswiber
12/01/2024, 3:50 AMazzouzana
12/01/2024, 12:15 PM