carrefour url scraper

This commit is contained in:
Cat /dev/Nulo 2023-12-29 22:58:37 -03:00
parent 2bac37df53
commit 73759ae6d9
5 changed files with 80 additions and 18 deletions

BIN
bun.lockb

Binary file not shown.

carrefour-link-scraper/index.ts

@@ -0,0 +1,44 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";

// Run the scrape immediately only when executed directly (e.g. `bun index.ts`),
// not when imported by the auto scraper.
if (import.meta.main) await scrapBySitemap();

export async function scrapCarrefourProducts() {
  await scrapBySitemap();
}

async function scrapBySitemap() {
  // from https://www.carrefour.com.ar/sitemap.xml
  const sitemaps = [
    "https://www.carrefour.com.ar/sitemap/product-0.xml",
    "https://www.carrefour.com.ar/sitemap/product-1.xml",
    "https://www.carrefour.com.ar/sitemap/product-2.xml",
    "https://www.carrefour.com.ar/sitemap/product-3.xml",
    "https://www.carrefour.com.ar/sitemap/product-4.xml",
    "https://www.carrefour.com.ar/sitemap/product-5.xml",
    "https://www.carrefour.com.ar/sitemap/product-6.xml",
    "https://www.carrefour.com.ar/sitemap/product-7.xml",
    "https://www.carrefour.com.ar/sitemap/product-8.xml",
    "https://www.carrefour.com.ar/sitemap/product-9.xml",
  ];

  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      const xml = await res.text();

      // Collect every <loc> URL with Bun's streaming HTMLRewriter.
      const urls = new Set<string>();
      let buf = "";
      await new HTMLRewriter()
        .on("loc", {
          text(chunk) {
            // text() can deliver a node in several chunks; buffer until it ends.
            buf += chunk.text;
            if (!chunk.lastInTextNode) return;
            const url = buf.trim();
            buf = "";
            if (url) urls.add(url);
          },
        })
        .transform(new Response(xml))
        .text(); // consume the body so the handlers are guaranteed to run

      saveUrls(Array.from(urls));
    },
    { concurrency: 3 }
  );
}
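For reference, each sitemap is plain XML with one <loc> per product URL, which is why a single element handler is the whole parser. A minimal, hypothetical self-test of the extraction (not part of the commit; assumes Bun, which provides the HTMLRewriter global, and an invented sample URL):

// Hypothetical self-test of the <loc> extraction; run under Bun.
const sample = `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://www.carrefour.com.ar/algun-producto/p</loc></url>
</urlset>`;

const found = new Set<string>();
let buf = "";
await new HTMLRewriter()
  .on("loc", {
    text(chunk) {
      buf += chunk.text; // text() may arrive in partial chunks
      if (chunk.lastInTextNode) {
        const url = buf.trim();
        if (url) found.add(url);
        buf = "";
      }
    },
  })
  .transform(new Response(sample))
  .text(); // consuming the body guarantees the handlers ran

console.log(found); // Set { "https://www.carrefour.com.ar/algun-producto/p" }

The { concurrency: 3 } passed to pMap caps in-flight sitemap downloads at three, a politer alternative to firing all ten fetches at once with Promise.all.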

carrefour-link-scraper/package.json

@@ -0,0 +1,17 @@
{
"name": "carrefour-link-scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"linkedom": "^0.16.5",
"p-map": "^7.0.1"
}
}
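The scraper persists through saveUrls from db-datos/urlHelpers.js, whose implementation is not part of this diff. A plausible sketch of such a helper, assuming drizzle-orm over SQLite and the productoUrls table that auto.ts queries below; the database path and everything except the saveUrls / productoUrls names are guesses:

// Sketch only: the real db-datos/urlHelpers.js may differ.
import { drizzle } from "drizzle-orm/better-sqlite3";
import Database from "better-sqlite3";
import { productoUrls } from "./schema.js";

const db = drizzle(new Database("../sqlite.db")); // hypothetical DB path

export function saveUrls(urls: string[]) {
  // Insert each URL, ignoring duplicates so re-running a scrape is idempotent.
  db.insert(productoUrls)
    .values(urls.map((url) => ({ url })))
    .onConflictDoNothing()
    .run();
}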

package.json

@@ -4,6 +4,7 @@
   "workspaces": [
     "dia-link-scraper",
     "coto-link-scraper",
+    "carrefour-link-scraper",
     "scraper",
     "sitio",
     "db-datos"

scraper/auto.ts

@@ -14,6 +14,7 @@ import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
 import { scrapDiaProducts } from "../dia-link-scraper/index.js";
 import { scrapCotoProducts } from "../coto-link-scraper/index.js";
+import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
 
 const supermercados: Supermercado[] = [
   Supermercado.Carrefour,
@@ -79,12 +80,7 @@ class Auto {
     const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
     let listPath: string;
 
-    if (supermercado === "Carrefour") {
-      // TODO: Carrefour doesn't have a scraper that saves to the DB yet
-      listPath = resolve(
-        join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
-      );
-    } else {
+    {
       const t0 = performance.now();
       switch (supermercado) {
         case "Dia":
@@ -93,23 +89,27 @@ class Auto {
         case "Coto":
           await scrapCotoProducts();
           break;
+        case "Carrefour":
+          await scrapCarrefourProducts();
+          break;
       }
       this.inform(
         `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
       );
-      listPath = join(ctxPath, `lista-${supermercado}.txt`);
-      const host = Object.entries(hosts).find(
-        ([host, supe]) => supe === supermercado
-      )![0];
-      const results = await db.query.productoUrls
-        .findMany({
-          where: like(productoUrls.url, `%${host}%`),
-        })
-        .execute();
-      const urls = results.map((r) => r.url);
-      await writeFile(listPath, urls.join("\n") + "\n");
     }
+    listPath = join(ctxPath, `lista-${supermercado}.txt`);
+    const host = Object.entries(hosts).find(
+      ([host, supe]) => supe === supermercado
+    )![0];
+    const results = await db.query.productoUrls
+      .findMany({
+        where: like(productoUrls.url, `%${host}%`),
+      })
+      .execute();
+    const urls = results.map((r) => r.url);
+    await writeFile(listPath, urls.join("\n") + "\n");
 
     const date = new Date();
     const zstdWarcName = `${supermercado}-${format(