carrefour url scraper

Cat /dev/Nulo 2023-12-29 22:58:37 -03:00
parent 2bac37df53
commit 73759ae6d9
5 changed files with 80 additions and 18 deletions

bun.lockb (binary file, not shown)

carrefour-link-scraper/index.ts

@@ -0,0 +1,44 @@
+import pMap from "p-map";
+import { saveUrls } from "db-datos/urlHelpers.js";
+
+// scrape when this file is run directly (`bun index.ts`),
+// not every time the module is imported
+if (import.meta.main) await scrapBySitemap();
+
+export async function scrapCarrefourProducts() {
+  await scrapBySitemap();
+}
+
+async function scrapBySitemap() {
+  // from https://www.carrefour.com.ar/sitemap.xml
+  const sitemaps = [
+    "https://www.carrefour.com.ar/sitemap/product-0.xml",
+    "https://www.carrefour.com.ar/sitemap/product-1.xml",
+    "https://www.carrefour.com.ar/sitemap/product-2.xml",
+    "https://www.carrefour.com.ar/sitemap/product-3.xml",
+    "https://www.carrefour.com.ar/sitemap/product-4.xml",
+    "https://www.carrefour.com.ar/sitemap/product-5.xml",
+    "https://www.carrefour.com.ar/sitemap/product-6.xml",
+    "https://www.carrefour.com.ar/sitemap/product-7.xml",
+    "https://www.carrefour.com.ar/sitemap/product-8.xml",
+    "https://www.carrefour.com.ar/sitemap/product-9.xml",
+  ];
+
+  await pMap(
+    sitemaps,
+    async (sitemapUrl) => {
+      const res = await fetch(sitemapUrl);
+      const xml = await res.text();
+
+      // HTMLRewriter is a Bun built-in; collect every <loc> entry
+      const urls = new Set<string>();
+      new HTMLRewriter()
+        .on("loc", {
+          text(element) {
+            const txt = element.text.trim();
+            if (!txt) return;
+            urls.add(txt);
+          },
+        })
+        .transform(new Response(xml));
+
+      saveUrls(Array.from(urls));
+    },
+    { concurrency: 3 }
+  );
+}
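One caveat about the <loc> handler above: HTMLRewriter streams its input, so a single text node can arrive split across several text() callbacks, and depending on the runtime the handlers may only fire as the transformed body is consumed. Below is a minimal sketch of a more defensive extractor, assuming Bun's HTMLRewriter (which, like Cloudflare's, exposes lastInTextNode on text chunks); extractSitemapUrls is a hypothetical helper, not part of this commit:

// Buffer <loc> text until the node ends, since a URL may be split
// across multiple streamed chunks; then hand back the full set.
async function extractSitemapUrls(xml: string): Promise<Set<string>> {
  const urls = new Set<string>();
  let buffer = "";
  const rewritten = new HTMLRewriter()
    .on("loc", {
      text(chunk) {
        buffer += chunk.text;
        if (chunk.lastInTextNode) {
          const url = buffer.trim();
          if (url) urls.add(url);
          buffer = "";
        }
      },
    })
    .transform(new Response(xml));
  await rewritten.text(); // drain the body so every handler has run
  return urls;
}

The committed version adds each chunk as-is, which holds up as long as the parser delivers every URL in one piece; buffering costs little and removes that assumption.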

carrefour-link-scraper/package.json

@@ -0,0 +1,17 @@
+{
+  "name": "carrefour-link-scraper",
+  "type": "module",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "linkedom": "^0.16.5",
+    "p-map": "^7.0.1"
+  }
+}

package.json

@@ -4,6 +4,7 @@
"workspaces": [
"dia-link-scraper",
"coto-link-scraper",
"carrefour-link-scraper",
"scraper",
"sitio",
"db-datos"

scraper/auto.ts

@@ -14,6 +14,7 @@ import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
 import { scrapDiaProducts } from "../dia-link-scraper/index.js";
 import { scrapCotoProducts } from "../coto-link-scraper/index.js";
+import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
 
 const supermercados: Supermercado[] = [
   Supermercado.Carrefour,
@@ -79,12 +80,7 @@ class Auto {
     const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
     let listPath: string;
-    if (supermercado === "Carrefour") {
-      // TODO: carrefour doesn't have a scraper that saves to the DB yet
-      listPath = resolve(
-        join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
-      );
-    } else {
+    {
       const t0 = performance.now();
       switch (supermercado) {
         case "Dia":
@@ -93,10 +89,14 @@ class Auto {
         case "Coto":
           await scrapCotoProducts();
           break;
+        case "Carrefour":
+          await scrapCarrefourProducts();
+          break;
       }
       this.inform(
         `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
       );
     }
+    listPath = join(ctxPath, `lista-${supermercado}.txt`);
 
     const host = Object.entries(hosts).find(
@@ -109,7 +109,7 @@ class Auto {
       .execute();
     const urls = results.map((r) => r.url);
     await writeFile(listPath, urls.join("\n") + "\n");
   }
 
   const date = new Date();
   const zstdWarcName = `${supermercado}-${format(
     date,
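The tail of this method (last hunk above) reads the freshly saved URLs back out of the productoUrls table and writes the one-URL-per-line list that the wget step consumes. Below is a minimal sketch of that step, assuming the drizzle-orm setup from the db-datos workspace; the db import path and the writeUrlList helper are hypothetical, while like, productoUrls, .execute(), and the writeFile line come from the diff:

import { like } from "drizzle-orm";
import { writeFile } from "node:fs/promises";
import { productoUrls } from "db-datos/schema.js";
import { db } from "db-datos/db.js"; // hypothetical import path

// Select every saved URL matching the supermercado's host and write
// the list file the crawler will fetch, one URL per line.
async function writeUrlList(listPath: string, host: string) {
  const results = await db
    .select({ url: productoUrls.url })
    .from(productoUrls)
    .where(like(productoUrls.url, `%${host}%`))
    .execute();
  const urls = results.map((r) => r.url);
  await writeFile(listPath, urls.join("\n") + "\n");
}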