mirror of https://github.com/catdevnull/preciazo.git
synced 2024-11-26 11:36:20 +00:00

carrefour url scraper

parent 2bac37df53
commit 73759ae6d9
5 changed files with 80 additions and 18 deletions

bun.lockb (BIN)
Binary file not shown.

carrefour-link-scraper/index.ts (new file, 44 lines)
@@ -0,0 +1,44 @@
import pMap from "p-map";
import { saveUrls } from "db-datos/urlHelpers.js";

await scrapBySitemap();

export async function scrapCarrefourProducts() {
  await scrapBySitemap();
}

async function scrapBySitemap() {
  // from https://www.carrefour.com.ar/sitemap.xml
  const sitemaps = [
    "https://www.carrefour.com.ar/sitemap/product-0.xml",
    "https://www.carrefour.com.ar/sitemap/product-1.xml",
    "https://www.carrefour.com.ar/sitemap/product-2.xml",
    "https://www.carrefour.com.ar/sitemap/product-3.xml",
    "https://www.carrefour.com.ar/sitemap/product-4.xml",
    "https://www.carrefour.com.ar/sitemap/product-5.xml",
    "https://www.carrefour.com.ar/sitemap/product-6.xml",
    "https://www.carrefour.com.ar/sitemap/product-7.xml",
    "https://www.carrefour.com.ar/sitemap/product-8.xml",
    "https://www.carrefour.com.ar/sitemap/product-9.xml",
  ];

  await pMap(
    sitemaps,
    async (sitemapUrl) => {
      const res = await fetch(sitemapUrl);
      const xml = await res.text();
      let urls = new Set<string>();
      new HTMLRewriter()
        .on("loc", {
          text(element) {
            const txt = element.text.trim();
            if (!txt) return;
            urls.add(txt);
          },
        })
        .transform(new Response(xml));
      saveUrls(Array.from(urls));
    },
    { concurrency: 3 }
  );
}
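
The notable choice in index.ts is parsing sitemap XML with Bun's built-in HTMLRewriter rather than a DOM library. Below is a minimal, self-contained sketch of the same idea (a sketch assuming a Bun runtime and a made-up inline sitemap, not the committed code). One nuance: text handlers can receive a text node in several chunks, so this version buffers until lastInTextNode before trimming.

// Sketch only (assumes the Bun runtime, where HTMLRewriter is a global).
// The sitemap XML here is a made-up example, not a real Carrefour document.
const sampleXml = `<?xml version="1.0" encoding="UTF-8"?>
<urlset>
  <url><loc>https://www.carrefour.com.ar/some-product/p</loc></url>
  <url><loc>https://www.carrefour.com.ar/another-product/p</loc></url>
</urlset>`;

async function extractLocs(xml: string): Promise<string[]> {
  const urls: string[] = [];
  let buffer = ""; // <loc> text can arrive in several chunks
  const rewriter = new HTMLRewriter().on("loc", {
    text(chunk) {
      buffer += chunk.text;
      if (chunk.lastInTextNode) {
        const url = buffer.trim();
        if (url) urls.push(url);
        buffer = "";
      }
    },
  });
  // Consuming the transformed response makes sure the whole document is parsed.
  await rewriter.transform(new Response(xml)).text();
  return urls;
}

console.log(await extractLocs(sampleXml));

The committed scraper keeps it simpler, trimming each chunk and adding it to a Set, which is equivalent as long as every <loc> value arrives in a single chunk.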

carrefour-link-scraper/package.json (new file, 17 lines)
@@ -0,0 +1,17 @@
{
  "name": "carrefour-link-scraper",
  "type": "module",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "dependencies": {
    "linkedom": "^0.16.5",
    "p-map": "^7.0.1"
  }
}
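
Of the two dependencies declared here, only p-map is imported by the index.ts above; linkedom is declared but not used in that file. p-map is what bounds the sitemap downloads to three at a time via { concurrency: 3 }. A small standalone sketch of that pattern, with placeholder URLs rather than the committed list:

import pMap from "p-map";

// Placeholder URLs; the real list lives in carrefour-link-scraper/index.ts.
const sitemaps = [
  "https://example.com/sitemap/product-0.xml",
  "https://example.com/sitemap/product-1.xml",
  "https://example.com/sitemap/product-2.xml",
  "https://example.com/sitemap/product-3.xml",
];

// pMap maps over the list with at most `concurrency` promises in flight,
// so no more than 3 sitemap downloads run at the same time.
const sizes = await pMap(
  sitemaps,
  async (url) => {
    const res = await fetch(url);
    const body = await res.text();
    return { url, bytes: body.length };
  },
  { concurrency: 3 }
);

console.log(sizes);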

package.json
@@ -4,6 +4,7 @@
"workspaces": [
|
||||
"dia-link-scraper",
|
||||
"coto-link-scraper",
|
||||
"carrefour-link-scraper",
|
||||
"scraper",
|
||||
"sitio",
|
||||
"db-datos"
|
||||
|
|
|

scraper/auto.ts
@@ -14,6 +14,7 @@ import { like } from "drizzle-orm";
import { productoUrls } from "db-datos/schema.js";
import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";

const supermercados: Supermercado[] = [
  Supermercado.Carrefour,
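
With a third per-supermarket URL scraper imported, the switch shown in the next hunks is one natural wiring; a lookup table is an equivalent shape. A hypothetical sketch, not part of this commit, assuming each scraper is a zero-argument async function as the await scrapXxxProducts() calls in the hunks suggest:

import { scrapDiaProducts } from "../dia-link-scraper/index.js";
import { scrapCotoProducts } from "../coto-link-scraper/index.js";
import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";

// Hypothetical alternative to the switch: one entry per supermarket.
// The string keys match the Supermercado values used in the cases below.
const urlScrapers: Record<string, () => Promise<unknown>> = {
  Dia: scrapDiaProducts,
  Coto: scrapCotoProducts,
  Carrefour: scrapCarrefourProducts,
};

// Hypothetical helper, not the real method from the Auto class.
export async function scrapUrlsFor(supermercado: string) {
  const scraper = urlScrapers[supermercado];
  if (!scraper) throw new Error(`no URL scraper for ${supermercado}`);
  await scraper();
}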

@@ -79,12 +80,7 @@ class Auto {
    const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));

    let listPath: string;
    if (supermercado === "Carrefour") {
      // TODO: carrefour doesn't have a scraper that saves to the DB yet
      listPath = resolve(
        join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
      );
    } else {
      {
        const t0 = performance.now();
        switch (supermercado) {
          case "Dia":
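
The Carrefour branch in this hunk bypasses the database: the URL list is read from a pre-generated text file under LISTS_DIR, falling back to ../data. A self-contained sketch of that lookup; LISTS_DIR, the fallback, and the file naming come from the diff, while reading and counting the list is illustrative:

import { join, resolve } from "node:path";
import { readFile } from "node:fs/promises";

// Same resolution as the Carrefour branch above: LISTS_DIR and the "../data"
// fallback come from the diff; reading and counting the list is illustrative.
const supermercado = "Carrefour";
const listPath = resolve(
  join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
);

const lines = (await readFile(listPath, "utf-8"))
  .split("\n")
  .filter((line) => line.trim().length > 0);
console.log(`${lines.length} URLs listed in ${listPath}`);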

@@ -93,23 +89,27 @@
          case "Coto":
            await scrapCotoProducts();
            break;
          case "Carrefour":
            await scrapCarrefourProducts();
            break;
        }
        this.inform(
          `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
        );

        listPath = join(ctxPath, `lista-${supermercado}.txt`);
        const host = Object.entries(hosts).find(
          ([host, supe]) => supe === supermercado
        )![0];
        const results = await db.query.productoUrls
          .findMany({
            where: like(productoUrls.url, `%${host}%`),
          })
          .execute();
        const urls = results.map((r) => r.url);
        await writeFile(listPath, urls.join("\n") + "\n");
    }

    listPath = join(ctxPath, `lista-${supermercado}.txt`);
    const host = Object.entries(hosts).find(
      ([host, supe]) => supe === supermercado
    )![0];
    const results = await db.query.productoUrls
      .findMany({
        where: like(productoUrls.url, `%${host}%`),
      })
      .execute();
    const urls = results.map((r) => r.url);
    await writeFile(listPath, urls.join("\n") + "\n");

    const date = new Date();
    const zstdWarcName = `${supermercado}-${format(
      date,
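
The URL-list block appears on both sides of this hunk, and the recipe is the same in each: find the hostname mapped to the supermercado in hosts, select every productoUrls row whose URL contains that host, and write the URLs to a newline-separated list file for the downloader. A standalone sketch of that recipe; hosts is an illustrative stand-in and db stands for the drizzle instance from db-datos, typed loosely to keep the sketch self-contained:

import { like } from "drizzle-orm";
import { writeFile } from "node:fs/promises";
import { productoUrls } from "db-datos/schema.js";

// Illustrative host map; in the scraper it maps hostnames to Supermercado values.
const hosts: Record<string, string> = {
  "www.carrefour.com.ar": "Carrefour",
};

// Hypothetical helper (not in the commit). `db` is the drizzle instance
// exported by db-datos; typed loosely so the sketch stays self-contained.
export async function writeUrlList(db: any, supermercado: string, listPath: string) {
  // Reverse lookup: which hostname belongs to this supermarket?
  const host = Object.entries(hosts).find(([, supe]) => supe === supermercado)![0];
  // Every stored product URL for that host.
  const results = await db.query.productoUrls
    .findMany({ where: like(productoUrls.url, `%${host}%`) })
    .execute();
  const urls = results.map((r: { url: string }) => r.url);
  await writeFile(listPath, urls.join("\n") + "\n");
}

The like(productoUrls.url, `%${host}%`) filter is what narrows the shared productoUrls table down to a single supermarket's URLs.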