mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 22:26:19 +00:00
scrapear dia y carrefour via bunjs
This commit is contained in:
parent
5e55ad7131
commit
d85b54f837
8 changed files with 227 additions and 102 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -3,4 +3,6 @@ data/carrefour
|
|||
p.*
|
||||
p
|
||||
node_modules/
|
||||
*.db
|
||||
*.db
|
||||
scraper/debug/
|
||||
scraper/x.tsv
|
|
@ -43,7 +43,7 @@ importers:
|
|||
version: 9.2.2
|
||||
drizzle-orm:
|
||||
specifier: ^0.29.1
|
||||
version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)
|
||||
version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19)
|
||||
linkedom:
|
||||
specifier: ^0.16.5
|
||||
version: 0.16.5
|
||||
|
@ -59,6 +59,9 @@ importers:
|
|||
warcio:
|
||||
specifier: ^2.2.1
|
||||
version: 2.2.1
|
||||
zod:
|
||||
specifier: ^3.22.4
|
||||
version: 3.22.4
|
||||
devDependencies:
|
||||
'@types/better-sqlite3':
|
||||
specifier: ^7.6.8
|
||||
|
@ -66,6 +69,9 @@ importers:
|
|||
'@types/node':
|
||||
specifier: ^20.10.5
|
||||
version: 20.10.5
|
||||
bun-types:
|
||||
specifier: ^1.0.19
|
||||
version: 1.0.19
|
||||
tsx:
|
||||
specifier: ^4.7.0
|
||||
version: 4.7.0
|
||||
|
@ -274,6 +280,11 @@ packages:
|
|||
dependencies:
|
||||
undici-types: 5.26.5
|
||||
|
||||
/@types/ws@8.5.10:
|
||||
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==}
|
||||
dependencies:
|
||||
'@types/node': 20.10.5
|
||||
|
||||
/ansi-regex@5.0.1:
|
||||
resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==}
|
||||
engines: {node: '>=8'}
|
||||
|
@ -330,6 +341,13 @@ packages:
|
|||
ieee754: 1.2.1
|
||||
dev: false
|
||||
|
||||
/bun-types@1.0.19:
|
||||
resolution: {integrity: sha512-7P5/r+twssrkDQ6HMit2GARMBbAxz1tLLEcMgQOCZeCX9BzNtabktjPCu+DmcvDYDnL/Ke75pmKg9CNBTlCzlQ==}
|
||||
dependencies:
|
||||
'@types/node': 20.10.5
|
||||
'@types/ws': 8.5.10
|
||||
undici-types: 5.26.5
|
||||
|
||||
/chownr@1.1.4:
|
||||
resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==}
|
||||
dev: false
|
||||
|
@ -424,7 +442,7 @@ packages:
|
|||
domhandler: 5.0.3
|
||||
dev: false
|
||||
|
||||
/drizzle-orm@0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2):
|
||||
/drizzle-orm@0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19):
|
||||
resolution: {integrity: sha512-yItc4unfHnk8XkDD3/bdC63vdboTY7e7I03lCF1OJYABXSIfQYU9BFTQJXMMovVeb3T1/OJWwfW/70T1XPnuUA==}
|
||||
peerDependencies:
|
||||
'@aws-sdk/client-rds-data': '>=3'
|
||||
|
@ -488,6 +506,7 @@ packages:
|
|||
dependencies:
|
||||
'@types/better-sqlite3': 7.6.8
|
||||
better-sqlite3: 9.2.2
|
||||
bun-types: 1.0.19
|
||||
dev: false
|
||||
|
||||
/emoji-regex@8.0.0:
|
||||
|
@ -955,3 +974,7 @@ packages:
|
|||
y18n: 5.0.8
|
||||
yargs-parser: 21.1.1
|
||||
dev: false
|
||||
|
||||
/zod@3.22.4:
|
||||
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
|
||||
dev: false
|
||||
|
|
70
scraper/carrefour.ts
Normal file
70
scraper/carrefour.ts
Normal file
|
@ -0,0 +1,70 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { Precioish, type Precio } from "./scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta } from "./common.js";
|
||||
|
||||
/**
 * Extracts the EAN from the product specification table.
 *
 * Locates the `td[data-specification="EAN"]` cell, takes the second child of
 * its parent element and returns that element's `data-specification` value.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @returns the non-empty `data-specification` value of the sibling cell.
 * @throws Error when the label cell, the sibling cell or its value is missing.
 */
function getEanByTable(dom: Window): string {
  const labelCell = dom.window.document.querySelector(
    'td[data-specification="EAN"]'
  );
  // The value is read from the second child of the label's parent element.
  const valueCell = labelCell?.parentElement?.children[1];

  if (valueCell && valueCell instanceof dom.window.HTMLElement) {
    const ean = valueCell.dataset.specification;
    if (ean) return ean;
  }
  throw new Error("No encontré el EAN");
}
|
||||
/**
 * Parses the JSON payload embedded in a
 * `<template data-type="json" data-varname="...">` element.
 *
 * The payload is taken from the `innerHTML` of the first child of the
 * template's content.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @param varname - value of the template's `data-varname` attribute.
 * @returns the parsed JSON, typed as `T` without any runtime validation.
 * @throws Error when the template or its first child is not found;
 *   SyntaxError when the payload is not valid JSON.
 */
function parseScriptJson<T>(dom: Window, varname: string): T {
  const selector = `template[data-type="json"][data-varname="${varname}"]`;
  const template =
    dom.window.document.querySelector<HTMLTemplateElement>(selector);
  const payloadEl = template?.content?.children[0];
  if (!payloadEl) {
    throw new Error("no encuentro el script");
  }
  return JSON.parse(payloadEl.innerHTML);
}
|
||||
/**
 * Extracts the EAN from the `__STATE__` JSON embedded in the page.
 *
 * Looks for the entry whose key starts with `Product:` and whose
 * `__typename` is `"Product"`, then for the entry whose key starts with
 * `Product:<cacheId>` and whose `__typename` is `"SKU"`, and returns that
 * SKU's `ean`.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @returns the SKU's EAN as a non-empty string.
 * @throws Error when the state template, the product entry, the SKU entry or
 *   a usable `ean` value cannot be found.
 */
function eanFromSeedState(dom: Window): string {
  // Compute the entries once; both lookups scan the same state object.
  const entries = Object.entries(parseScriptJson<object>(dom, "__STATE__"));

  // `val` may be null or a primitive in arbitrary JSON, hence the `?.`.
  const productJson = entries.find(
    ([key, val]) => key.startsWith("Product:") && val?.__typename === "Product"
  );
  if (!productJson) throw new Error("no encontré el product en el json");

  const skuKeyPrefix = `Product:${productJson[1].cacheId}`;
  const productSkuJson = entries.find(
    ([key, val]) => key.startsWith(skuKeyPrefix) && val?.__typename === "SKU"
  );
  if (!productSkuJson) throw new Error("no encontré el sku en el json");

  // Guard the declared `string` return type: without this check a SKU
  // lacking `ean` would make the function return `undefined`.
  const ean: unknown = productSkuJson[1].ean;
  if (typeof ean !== "string" || ean === "")
    throw new Error("no encontré el ean en el sku");
  return ean;
}
|
||||
/**
 * Extracts the EAN from the `ctx` query parameter of the Dynamic Yield
 * `<script>` tag.
 *
 * `ctx` is parsed as JSON and the first element of its `data` array is
 * returned.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @returns the first element of `ctx.data`.
 * @throws Error when the script tag, the `ctx` parameter or `data[0]` is
 *   missing; SyntaxError when `ctx` is not valid JSON.
 */
function eanFromDynamicYieldScript(dom: Window): string {
  const scriptEl = dom.window.document.querySelector(
    `script[src^="//st.dynamicyield.com/st?"]`
  );
  if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
    throw new Error("no encuentro el script de dynamicyield");

  // The selector only matches protocol-relative sources ("//host/..."), and
  // `new URL()` rejects those without a base. The base is ignored when `src`
  // is already absolute.
  const url = new URL(scriptEl.src, "https://st.dynamicyield.com");
  const ctx = url.searchParams.get("ctx");
  if (!ctx) throw new Error("no hay ctx");

  const ean = JSON.parse(ctx)?.data?.[0];
  if (ean === undefined || ean === null)
    throw new Error("no hay ean en el ctx");
  return ean;
}
|
||||
|
||||
/**
 * Builds a price point from a Carrefour product page.
 *
 * The price comes from the page's price meta tag, the EAN from the embedded
 * `__STATE__` JSON, and stock availability from the first offer of the
 * Product JSON-LD.
 *
 * @param html - raw HTML of the product page.
 * @returns the EAN, price in centavos (or `null`) and stock flag.
 * @throws whatever the EAN or JSON-LD helpers throw when the page lacks the
 *   expected data.
 */
export function getCarrefourProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  // Same evaluation order as before: price, then EAN, then JSON-LD.
  const precioCentavos = priceFromMeta(dom);
  const ean = eanFromSeedState(dom);
  const [offer] = getProductJsonLd(dom).offers.offers;
  const inStock = offer.availability === "http://schema.org/InStock";

  return { ean, precioCentavos, inStock };
}
|
50
scraper/common.ts
Normal file
50
scraper/common.ts
Normal file
|
@ -0,0 +1,50 @@
|
|||
import { z } from "zod";
|
||||
|
||||
/**
 * Reads the `content` attribute of the first `<meta>` element whose
 * `property` attribute equals `prop`.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @param prop - value of the `property` attribute to look for.
 * @returns the attribute value as returned by `getAttribute`, or `undefined`
 *   when no matching element exists.
 */
export function getMetaProp(dom: Window, prop: string) {
  const metaEl = dom.window.document.querySelector(
    `meta[property="${prop}"]`
  );
  return metaEl?.getAttribute("content");
}
|
||||
|
||||
/**
 * Reads the product price from the `product:price:amount` meta tag and
 * converts it to an integer number of centavos.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @returns the price in centavos, or `null` when the meta tag is missing,
 *   empty, or does not contain a parseable number.
 */
export function priceFromMeta(dom: Window) {
  const precioMeta = getMetaProp(dom, "product:price:amount");
  if (!precioMeta) return null;

  const precio = parseFloat(precioMeta);
  // Treat unparseable content like a missing price instead of returning NaN.
  if (!Number.isFinite(precio)) return null;

  // Round because binary floating point makes e.g. 19.99 * 100 evaluate to
  // 1998.9999999999998 instead of the integer 1999.
  return Math.round(precio * 100);
}
|
||||
|
||||
/**
 * Parses every `<script type="application/ld+json">` element of the document.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @returns one parsed value per script element, in document order.
 * @throws SyntaxError when any script's content is not valid JSON.
 */
function parseJsonLds(dom: Window): object[] {
  const ldScripts = dom.window.document.querySelectorAll(
    'script[type="application/ld+json"]'
  );
  return Array.from(ldScripts, (scriptEl) => JSON.parse(scriptEl.innerHTML));
}
|
||||
/**
 * Finds the first JSON-LD object in the document whose `@type` equals `type`.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @param type - the `@type` value to match, compared with `===`.
 * @returns the first matching object, or `undefined` when none matches.
 */
function findJsonLd(dom: Window, type: string): object | undefined {
  for (const candidate of parseJsonLds(dom)) {
    if ("@type" in candidate && candidate["@type"] === type) return candidate;
  }
  return undefined;
}
|
||||
|
||||
/** Schema for a single `Offer` entry of the Product JSON-LD. */
const zOfferLd = z.object({
  "@type": z.literal("Offer"),
  price: z.number(),
  priceCurrency: z.literal("ARS"),
  availability: z.enum([
    "http://schema.org/OutOfStock",
    "http://schema.org/InStock",
  ]),
});

/**
 * Schema for the subset of the `Product` JSON-LD that the scrapers read.
 *
 * `offers.offers` is declared as a one-element tuple.
 * NOTE(review): a page listing more than one offer would presumably fail
 * validation — confirm against real pages.
 */
const zProductLd = z.object({
  "@type": z.literal("Product"),
  name: z.string(),
  image: z.string(),
  offers: z.object({
    offers: z.tuple([zOfferLd]),
  }),
});
/** Validated Product JSON-LD shape, inferred from `zProductLd`. */
type ProductLd = z.infer<typeof zProductLd>;
|
||||
|
||||
/**
 * Finds the `Product` JSON-LD object in the document and validates it
 * against `zProductLd`.
 *
 * @param dom - parsed document wrapper exposing `window.document`.
 * @returns the validated Product JSON-LD.
 * @throws the schema's validation error when no Product JSON-LD is present
 *   or it does not match the schema.
 */
export function getProductJsonLd(dom: Window): ProductLd {
  return zProductLd.parse(findJsonLd(dom, "Product"));
}
|
21
scraper/dia.ts
Normal file
21
scraper/dia.ts
Normal file
|
@ -0,0 +1,21 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "./scrap.js";
|
||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||
|
||||
/**
 * Builds a price point from a Dia product page.
 *
 * The value of the `product:retailer_item_id` meta tag is used as the EAN,
 * the price comes from the price meta tag, and stock availability from the
 * first offer of the Product JSON-LD.
 *
 * @param html - raw HTML of the product page.
 * @returns the EAN, price in centavos (or `null`) and stock flag.
 * @throws Error when the EAN meta tag is missing or empty, plus whatever the
 *   JSON-LD helper throws.
 */
export function getDiaProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  const ean = getMetaProp(dom, "product:retailer_item_id");
  if (!ean) {
    throw new Error("No encontré el ean");
  }

  const precioCentavos = priceFromMeta(dom);
  const [offer] = getProductJsonLd(dom).offers.offers;

  return {
    ean,
    precioCentavos,
    inStock: offer.availability === "http://schema.org/InStock",
  };
}
|
|
@ -17,11 +17,13 @@
|
|||
"nanoid": "^5.0.4",
|
||||
"p-map": "^7.0.0",
|
||||
"undici": "^6.2.0",
|
||||
"warcio": "^2.2.1"
|
||||
"warcio": "^2.2.1",
|
||||
"zod": "^3.22.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/better-sqlite3": "^7.6.8",
|
||||
"@types/node": "^20.10.5",
|
||||
"bun-types": "^1.0.19",
|
||||
"tsx": "^4.7.0",
|
||||
"typescript": "^5.3.3"
|
||||
}
|
||||
|
|
147
scraper/scrap.ts
147
scraper/scrap.ts
|
@ -1,128 +1,79 @@
|
|||
/// <reference lib="dom" />
|
||||
/// <reference lib="dom.iterable" />
|
||||
/// <reference types="node" />
|
||||
import { parseHTML } from "linkedom";
|
||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||
import Database from "better-sqlite3";
|
||||
import { Database } from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import { precios } from "./db/schema.js";
|
||||
import { WARCParser } from "warcio";
|
||||
import { createReadStream } from "fs";
|
||||
import { createReadStream, createWriteStream } from "fs";
|
||||
import { writeFile } from "fs/promises";
|
||||
import { createHash } from "crypto";
|
||||
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
||||
import { getCarrefourProduct } from "./carrefour.js";
|
||||
import { getDiaProduct } from "./dia.js";
|
||||
import { join } from "path";
|
||||
|
||||
const sqlite = new Database("sqlite.db");
|
||||
const db = drizzle(sqlite);
|
||||
|
||||
type Precio = typeof precios.$inferInsert;
|
||||
const DEBUG = true;
|
||||
|
||||
export type Precio = typeof precios.$inferInsert;
|
||||
export type Precioish = Omit<Precio, "fetchedAt" | "url">;
|
||||
|
||||
/**
 * Inserts a single price point into the `precios` table.
 *
 * @param point - the row to insert.
 */
async function storePrecioPoint(point: Precio): Promise<void> {
  const insertion = db.insert(precios).values(point);
  await insertion;
}
|
||||
|
||||
function getEanByTable(dom: Window): string {
|
||||
const eanLabelEl = dom.window.document.querySelector(
|
||||
'td[data-specification="EAN"]'
|
||||
);
|
||||
const eanValueEl = eanLabelEl?.parentElement?.children[1];
|
||||
if (
|
||||
!eanValueEl ||
|
||||
!(eanValueEl instanceof dom.window.HTMLElement) ||
|
||||
!eanValueEl.dataset.specification
|
||||
)
|
||||
throw new Error("No encontré el EAN");
|
||||
return eanValueEl.dataset.specification;
|
||||
}
|
||||
|
||||
function parseJsonLds(dom: Window): object[] {
|
||||
const scripts = dom.window.document.querySelectorAll(
|
||||
'script[type="application/ld+json"]'
|
||||
);
|
||||
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML));
|
||||
}
|
||||
function findJsonLd(dom: Window, type: string): object | undefined {
|
||||
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
|
||||
}
|
||||
|
||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
||||
`template[data-type="json"][data-varname="${varname}"]`
|
||||
)?.content?.children[0];
|
||||
if (!script) throw new Error("no encuentro el script");
|
||||
return JSON.parse(script.innerHTML);
|
||||
}
|
||||
|
||||
function eanFromSeedState(dom: Window): string {
|
||||
const json = parseScriptJson<object>(dom, "__STATE__");
|
||||
const productJson = Object.entries(json).find(
|
||||
([key, val]) => key.startsWith("Product:") && val.__typename === "Product"
|
||||
);
|
||||
if (!productJson) throw new Error("no encontré el product en el json");
|
||||
|
||||
const productSkuJson = Object.entries(json).find(
|
||||
([key, val]) =>
|
||||
key.startsWith(`Product:${productJson[1].cacheId}`) &&
|
||||
val.__typename === "SKU"
|
||||
);
|
||||
if (!productSkuJson) throw new Error("no encontré el sku en el json");
|
||||
return productSkuJson[1].ean;
|
||||
}
|
||||
|
||||
function eanFromDynamicYieldScript(dom: Window): string {
|
||||
const scriptEl = dom.window.document.querySelector(
|
||||
`script[src^="//st.dynamicyield.com/st?"]`
|
||||
);
|
||||
if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
|
||||
throw new Error("no encuentro el script de dynamicyield");
|
||||
|
||||
const url = new URL(scriptEl.src);
|
||||
const ctx = url.searchParams.get("ctx");
|
||||
if (!ctx) throw new Error("no hay ctx");
|
||||
return JSON.parse(ctx).data[0];
|
||||
}
|
||||
|
||||
function getCarrefourProduct(html: string | Buffer): Precio {
|
||||
const dom = parseHTML(html);
|
||||
|
||||
const precioMeta = dom.window.document
|
||||
.querySelector(`meta[property="product:price:amount"]`)
|
||||
?.getAttribute("content");
|
||||
if (!precioMeta) throw new Error("No encontré el precio");
|
||||
const precioCentavos = parseFloat(precioMeta) * 100;
|
||||
|
||||
// const productLd = findJsonLd(dom, "Product");
|
||||
const ean = eanFromSeedState(dom);
|
||||
|
||||
return {
|
||||
ean,
|
||||
precioCentavos,
|
||||
fetchedAt: new Date(),
|
||||
};
|
||||
}
|
||||
|
||||
(async () => {
|
||||
// await migrate(db, { migrationsFolder: "./drizzle" });
|
||||
// const p = await getCarrefourProduct(
|
||||
// "https://www.carrefour.com.ar/bebida-lactea-la-serenisima-ultra-0-grasa-vainilla-900-cc/p"
|
||||
// );
|
||||
// await storePrecioPoint(p);
|
||||
const o = createWriteStream("x.tsv");
|
||||
o.write(`ean\tfetchedAt\tprecioCentavos\tinStock\turl\n`);
|
||||
|
||||
const warc = createReadStream(process.argv[2]);
|
||||
const parser = new WARCParser(warc);
|
||||
let progress = { done: 0, errors: 0 };
|
||||
for await (const record of parser) {
|
||||
if (record.warcType === "response") {
|
||||
if (!record.warcTargetURI) throw new Error("no uri");
|
||||
console.log(record.warcTargetURI);
|
||||
const html = await record.contentText();
|
||||
|
||||
const url = new URL(record.warcTargetURI);
|
||||
try {
|
||||
const product = getCarrefourProduct(html);
|
||||
console.log(product);
|
||||
let ish: Precioish | undefined = undefined;
|
||||
if (url.hostname === "www.carrefour.com.ar")
|
||||
ish = getCarrefourProduct(html);
|
||||
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
|
||||
ish = getDiaProduct(html);
|
||||
else console.error(`Unknown host ${url.hostname}`);
|
||||
|
||||
const p: Precio = {
|
||||
...ish,
|
||||
fetchedAt: new Date(record.warcDate!),
|
||||
url: record.warcTargetURI,
|
||||
};
|
||||
|
||||
if (ish)
|
||||
o.write(
|
||||
`${p.ean}\t${p.fetchedAt}\t${p.precioCentavos}\t${p.inStock}\t${p.url}\n`
|
||||
);
|
||||
|
||||
// console.log(product);
|
||||
progress.done++;
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
const urlHash = createHash("md5")
|
||||
.update(record.warcTargetURI!)
|
||||
.digest("hex");
|
||||
const output = `${urlHash}.html`;
|
||||
await writeFile(output, html);
|
||||
console.error(`wrote html to ${output}`);
|
||||
progress.errors++;
|
||||
|
||||
if (DEBUG) {
|
||||
const urlHash = createHash("md5")
|
||||
.update(record.warcTargetURI!)
|
||||
.digest("hex");
|
||||
const output = join("debug", `${urlHash}.html`);
|
||||
await writeFile(output, html);
|
||||
console.error(`wrote html to ${output}`);
|
||||
}
|
||||
} finally {
|
||||
console.debug(progress);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
6
scraper/tsconfig.json
Normal file
6
scraper/tsconfig.json
Normal file
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
"extends": "../tsconfig.json",
|
||||
"compilerOptions": {
|
||||
"types": ["bun-types"]
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue