carrefour url scraper
parent 2bac37df53
commit 73759ae6d9
5 changed files with 80 additions and 18 deletions
bun.lockb (binary file not shown)
carrefour-link-scraper/index.ts (new file, +44)

@@ -0,0 +1,44 @@
+import pMap from "p-map";
+import { saveUrls } from "db-datos/urlHelpers.js";
+
+await scrapBySitemap();
+
+export async function scrapCarrefourProducts() {
+  await scrapBySitemap();
+}
+
+async function scrapBySitemap() {
+  // de https://www.carrefour.com.ar/sitemap.xml
+  const sitemaps = [
+    "https://www.carrefour.com.ar/sitemap/product-0.xml",
+    "https://www.carrefour.com.ar/sitemap/product-1.xml",
+    "https://www.carrefour.com.ar/sitemap/product-2.xml",
+    "https://www.carrefour.com.ar/sitemap/product-3.xml",
+    "https://www.carrefour.com.ar/sitemap/product-4.xml",
+    "https://www.carrefour.com.ar/sitemap/product-5.xml",
+    "https://www.carrefour.com.ar/sitemap/product-6.xml",
+    "https://www.carrefour.com.ar/sitemap/product-7.xml",
+    "https://www.carrefour.com.ar/sitemap/product-8.xml",
+    "https://www.carrefour.com.ar/sitemap/product-9.xml",
+  ];
+
+  await pMap(
+    sitemaps,
+    async (sitemapUrl) => {
+      const res = await fetch(sitemapUrl);
+      const xml = await res.text();
+      let urls = new Set<string>();
+      new HTMLRewriter()
+        .on("loc", {
+          text(element) {
+            const txt = element.text.trim();
+            if (!txt) return;
+            urls.add(txt);
+          },
+        })
+        .transform(new Response(xml));
+      saveUrls(Array.from(urls));
+    },
+    { concurrency: 3 }
+  );
+}
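Two pieces of the new file are worth calling out: p-map caps concurrent sitemap fetches, and HTMLRewriter (a global in the Bun runtime, mirroring the Cloudflare Workers API) is used as a streaming parser to pull the <loc> entries out of each product sitemap. Below is a minimal standalone sketch of that extraction step, not the committed code: it assumes the Bun runtime, hard-codes one sitemap URL, and prints a count instead of calling saveUrls.

// Sketch only: collect every <loc> URL from one Carrefour product sitemap.
const res = await fetch("https://www.carrefour.com.ar/sitemap/product-0.xml");
const xml = await res.text();

const urls = new Set<string>();
let buffer = "";
const rewriter = new HTMLRewriter().on("loc", {
  text(chunk) {
    // text() can fire more than once per text node, so accumulate until the node ends
    buffer += chunk.text;
    if (chunk.lastInTextNode) {
      const url = buffer.trim();
      if (url) urls.add(url);
      buffer = "";
    }
  },
});
// Consume the transformed body so the handlers are guaranteed to have run.
await rewriter.transform(new Response(xml)).text();

console.log(`found ${urls.size} product URLs`);

With p-map's { concurrency: 3 }, the committed scraper works through the ten product sitemaps three at a time, which keeps the load on Carrefour's servers modest while still finishing quickly.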
carrefour-link-scraper/package.json (new file, +17)

@@ -0,0 +1,17 @@
+{
+  "name": "carrefour-link-scraper",
+  "type": "module",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "keywords": [],
+  "author": "",
+  "license": "ISC",
+  "dependencies": {
+    "linkedom": "^0.16.5",
+    "p-map": "^7.0.1"
+  }
+}
package.json

@@ -4,6 +4,7 @@
   "workspaces": [
     "dia-link-scraper",
     "coto-link-scraper",
+    "carrefour-link-scraper",
     "scraper",
     "sitio",
     "db-datos"
scraper/auto.ts

@@ -14,6 +14,7 @@ import { like } from "drizzle-orm";
 import { productoUrls } from "db-datos/schema.js";
 import { scrapDiaProducts } from "../dia-link-scraper/index.js";
 import { scrapCotoProducts } from "../coto-link-scraper/index.js";
+import { scrapCarrefourProducts } from "../carrefour-link-scraper/index.js";
 
 const supermercados: Supermercado[] = [
   Supermercado.Carrefour,
@@ -79,12 +80,7 @@ class Auto {
     const ctxPath = await mkdtemp(join(tmpdir(), "preciazo-scraper-wget-"));
 
     let listPath: string;
-    if (supermercado === "Carrefour") {
-      // TODO: carrefour todavía no tiene un scraper que guarde a la BD
-      listPath = resolve(
-        join(process.env.LISTS_DIR ?? "../data", `${supermercado}.txt`)
-      );
-    } else {
+    {
       const t0 = performance.now();
       switch (supermercado) {
         case "Dia":
@@ -93,10 +89,14 @@ class Auto {
         case "Coto":
           await scrapCotoProducts();
           break;
+        case "Carrefour":
+          await scrapCarrefourProducts();
+          break;
       }
       this.inform(
         `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
       );
+    }
 
     listPath = join(ctxPath, `lista-${supermercado}.txt`);
     const host = Object.entries(hosts).find(
@@ -109,7 +109,7 @@ class Auto {
       .execute();
     const urls = results.map((r) => r.url);
     await writeFile(listPath, urls.join("\n") + "\n");
-    }
+
     const date = new Date();
     const zstdWarcName = `${supermercado}-${format(
       date,
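Taken together, the auto.ts hunks drop the old Carrefour special case (which read a pre-built `${supermercado}.txt` list from LISTS_DIR instead of scraping) and route Carrefour through the same path as Dia and Coto. The sketch below paraphrases the resulting flow inside what is presumably the scrapUrls step; it is stitched together from the hunks above, with the Dia case body and the DB query abbreviated rather than copied from the file.

// Paraphrased flow after this commit; not the literal file contents.
{
  const t0 = performance.now();
  switch (supermercado) {
    case "Dia":
      await scrapDiaProducts();
      break;
    case "Coto":
      await scrapCotoProducts();
      break;
    case "Carrefour":
      await scrapCarrefourProducts(); // new: Carrefour URLs now land in the DB too
      break;
  }
  this.inform(
    `[scrapUrls[${supermercado}]] Tardó ${formatMs(performance.now() - t0)}`
  );
}

// Every supermercado now reaches this point: dump the URLs collected in the
// productoUrls table to a plain-text list for the wget/WARC step.
listPath = join(ctxPath, `lista-${supermercado}.txt`);
// ...query productoUrls for this host's URLs, then:
await writeFile(listPath, urls.join("\n") + "\n");

With this change there is no hand-maintained Carrefour URL list anymore; the list file is always regenerated from whatever the link scrapers saved to the database.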