This commit is contained in:
Cat /dev/Nulo 2023-12-21 14:00:00 -03:00
commit 6082815e32
9 changed files with 1421 additions and 0 deletions

6
.gitignore vendored Normal file
View file

@ -0,0 +1,6 @@
data/carrefour
*.warc.gz
p.*
p
node_modules/
*.db

9
scraper/db/schema.ts Normal file
View file

@ -0,0 +1,9 @@
import { integer, sqliteTable, text } from "drizzle-orm/sqlite-core";
/**
 * Column map of the `precios` table: one row per observed price.
 *
 * - `id`: auto-incrementing integer primary key.
 * - `fetchedAt`: required; integer column `fetched_at` in drizzle's timestamp mode.
 * - `precioCentavos`: required integer column `precio_centavos`.
 * - `ean`: required text column.
 * - `url`: optional text column (no NOT NULL constraint).
 */
const preciosColumns = {
  id: integer("id", { mode: "number" }).primaryKey({ autoIncrement: true }),
  fetchedAt: integer("fetched_at", { mode: "timestamp" }).notNull(),
  precioCentavos: integer("precio_centavos").notNull(),
  ean: text("ean").notNull(),
  url: text("url"),
};

/** SQLite table `precios`, built from the column map above. */
export const precios = sqliteTable("precios", preciosColumns);

27
scraper/downloadUrls.ts Normal file
View file

@ -0,0 +1,27 @@
import { readFile, writeFile } from "fs/promises";
import pMap from "p-map";
import { nanoid } from "nanoid";
import { getHtml } from "./fetch.js";
import { join } from "path";
/**
 * CLI entry point: downloads every URL listed in a text file.
 *
 * argv[2] is the input file (one URL per line); argv[3] is the output
 * location. For each URL two files are written under the output path:
 * `<id>.link` holding the URL and `<id>` holding the downloaded body, where
 * `<id>` is a fresh nanoid. The output path is not created here, so it is
 * assumed to be an existing directory. Exits with status 1 when either
 * argument is missing.
 */
(async () => {
  const inputPath = process.argv[2];
  const outputPath = process.argv[3];
  if (!inputPath || !outputPath) {
    console.error("falta input y/o output");
    process.exit(1);
  }
  const file = await readFile(inputPath, "utf-8");
  // Trim every line (drops the "\r" of CRLF files) and discard blanks: a
  // trailing newline would otherwise yield an empty "URL" whose failed
  // request aborts the whole run.
  const urls = file
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  await pMap(
    urls,
    async (url: string) => {
      const id = nanoid();
      const html = await getHtml(url);
      await writeFile(join(outputPath, `${id}.link`), url);
      await writeFile(join(outputPath, id), html);
    },
    // At most 12 downloads in flight at once.
    { concurrency: 12 }
  );
})();

35
scraper/fetch.ts Normal file
View file

@ -0,0 +1,35 @@
import { request } from "undici";
import { createBrotliDecompress, createUnzip } from "node:zlib";
import { pipeline } from "node:stream/promises";
/**
 * Requests `url` and resolves with the full response body as a Buffer.
 *
 * The body is decompressed according to the response's `content-encoding`
 * header: `gzip`/`deflate` go through an unzip stream, `br` through a Brotli
 * decompressor, and any other (or missing) value is read as-is.
 *
 * @param url Address to request.
 * @returns The (decompressed) response body.
 */
export async function getHtml(url: string): Promise<Buffer> {
  const response = await request(url, {
    headers: {},
  });
  const encoding = response.headers["content-encoding"];

  if (encoding === "gzip" || encoding === "deflate") {
    return await pipeline(response.body, createUnzip(), readableToBuffer);
  }
  if (encoding === "br") {
    return await pipeline(
      response.body,
      createBrotliDecompress(),
      readableToBuffer
    );
  }
  // Unknown or absent encoding: take the bytes untouched.
  return await readableToBuffer(response.body);
}
/**
 * Drains an async iterable of chunks into one contiguous Buffer.
 *
 * Binary chunks (Buffer / Uint8Array) are collected as-is; string chunks
 * (e.g. from a stream that has an encoding set) are converted with
 * `Buffer.from`, which encodes them as UTF-8.
 *
 * @param source Async iterable yielding binary or string chunks.
 * @returns All chunks concatenated in arrival order (empty Buffer if none).
 * @throws TypeError if a chunk is neither a string nor a Uint8Array.
 */
async function readableToBuffer(
  source: AsyncIterable<unknown>
): Promise<Buffer> {
  // https://stackoverflow.com/a/72891118
  const buffers: Uint8Array[] = [];
  for await (const data of source) {
    if (typeof data === "string") {
      buffers.push(Buffer.from(data));
    } else if (data instanceof Uint8Array) {
      // Buffer is a Uint8Array subclass, so this covers both.
      buffers.push(data);
    } else {
      throw new TypeError(
        `readableToBuffer: unsupported chunk type ${typeof data}`
      );
    }
  }
  return Buffer.concat(buffers);
}

29
scraper/package.json Normal file
View file

@ -0,0 +1,29 @@
{
"name": "scraper",
"type": "module",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"better-sqlite3": "^9.2.2",
"drizzle-orm": "^0.29.1",
"jsdom": "^23.0.1",
"nanoid": "^5.0.4",
"p-map": "^7.0.0",
"undici": "^6.2.0",
"warcio": "^2.2.1"
},
"devDependencies": {
"@types/better-sqlite3": "^7.6.8",
"@types/jsdom": "^21.1.6",
"@types/node": "^20.10.5",
"tsx": "^4.7.0",
"typescript": "^5.3.3"
}
}

1150
scraper/pnpm-lock.yaml Normal file

File diff suppressed because it is too large Load diff

66
scraper/scrap.ts Normal file
View file

@ -0,0 +1,66 @@
import { JSDOM } from "jsdom";
import { getHtml } from "./fetch.js";
import { drizzle } from "drizzle-orm/better-sqlite3";
import { migrate } from "drizzle-orm/better-sqlite3/migrator";
import Database from "better-sqlite3";
import { precios } from "./db/schema.js";
import { open } from "fs/promises";
import { WARCParser } from "warcio";
import { createReadStream } from "fs";
// Database handle for the file "sqlite.db" (relative path).
const sqlite = new Database("sqlite.db");
// Drizzle wrapper around the handle above; used by storePrecioPoint.
const db = drizzle(sqlite);
// Row shape accepted when inserting into `precios`, derived from the table definition.
type Precio = typeof precios.$inferInsert;
/**
 * Persists a single price observation as a new row of the `precios` table.
 *
 * @param point Row to insert.
 */
async function storePrecioPoint(point: Precio): Promise<void> {
  const insertQuery = db.insert(precios).values(point);
  await insertQuery;
}
/**
 * Parses a displayed price such as "$ 1.234,56" into integer cents.
 *
 * Assumes the format implied by the comma-as-decimal handling: "." groups
 * thousands and "," separates decimals.
 *
 * @param texto Text content of the price element.
 * @returns The price in cents, rounded to the nearest integer.
 * @throws Error if no number is found in the text.
 */
function parsePrecioCentavos(texto: string): number {
  // Take the first run of digits including both kinds of separator, so a
  // thousands "." does not cut the number short.
  const match = texto.match(/\d[\d.,]*/);
  if (!match) throw new Error("No encontré el precio");
  const normalized = match[0].replace(/\./g, "").replace(",", ".");
  const precio = parseFloat(normalized);
  if (!Number.isFinite(precio)) throw new Error("No encontré el precio");
  // Round: binary floats make e.g. 19.99 * 100 come out as 1998.9999999999998.
  return Math.round(precio * 100);
}

/**
 * Extracts the price and EAN from the HTML of a Carrefour product page.
 *
 * The price is read from the element with class
 * `valtech-carrefourar-product-price-0-x-listPriceValue`. The EAN is read
 * from the `data-specification` attribute of the second child of the row
 * containing `td[data-specification="EAN"]`.
 *
 * @param html Page markup.
 * @returns A row for `precios` with `ean`, `precioCentavos` and `fetchedAt`
 *   set to the current time (`url` is left unset).
 * @throws Error if the price or the EAN cannot be found.
 */
function getCarrefourProduct(html: string | Buffer): Precio {
  const dom = new JSDOM(html);
  const doc = dom.window.document;

  const listPriceValueEl = doc.querySelector(
    ".valtech-carrefourar-product-price-0-x-listPriceValue"
  );
  const priceText = listPriceValueEl?.textContent;
  if (!priceText) throw new Error("No encontré el precio");
  const precioCentavos = parsePrecioCentavos(priceText);

  const eanLabelEl = doc.querySelector('td[data-specification="EAN"]');
  // The value cell is the sibling right after the label cell in the same row.
  const eanValueEl = eanLabelEl?.parentElement?.children[1];
  if (
    !eanValueEl ||
    !(eanValueEl instanceof dom.window.HTMLElement) ||
    !eanValueEl.dataset.specification
  )
    throw new Error("No encontré el EAN");
  const ean = eanValueEl.dataset.specification;

  return {
    ean,
    precioCentavos,
    fetchedAt: new Date(),
  };
}
/**
 * CLI entry point: reads the WARC file given as argv[2] and, for every
 * `response` record, logs its target URI and the product parsed from its
 * body. Exits with status 1 when the path argument is missing. Nothing is
 * written to the database here (the storing code is commented out).
 */
(async () => {
  // await migrate(db, { migrationsFolder: "./drizzle" });
  // const p = await getCarrefourProduct(
  // "https://www.carrefour.com.ar/bebida-lactea-la-serenisima-ultra-0-grasa-vainilla-900-cc/p"
  // );
  // await storePrecioPoint(p);
  const warcPath = process.argv[2];
  // Fail with a clear message instead of an invalid-argument error from
  // createReadStream when no path was given.
  if (!warcPath) {
    console.error("falta input");
    process.exit(1);
  }
  const warc = createReadStream(warcPath);
  const parser = new WARCParser(warc);
  for await (const record of parser) {
    // Only response records carry page bodies to parse.
    if (record.warcType === "response") {
      console.log(record.warcTargetURI);
      const html = await record.contentText();
      const product = getCarrefourProduct(html);
      console.log(product);
    }
  }
})();

19
scraper/tsconfig.json Normal file
View file

@ -0,0 +1,19 @@
{
"$schema": "https://json.schemastore.org/tsconfig",
"display": "Node 20",
"_version": "20.1.0",
"compilerOptions": {
"lib": ["es2023"],
"module": "NodeNext",
"target": "es2022",
"strict": true,
"skipLibCheck": true,
"esModuleInterop": true,
"checkJs": true,
"noEmit": true,
"forceConsistentCasingInFileNames": true
},
"include": ["**/*.ts", "**/*.js"]
}

80
userscripts/carrefour.js Normal file
View file

@ -0,0 +1,80 @@
// @ts-check
(async () => {
/**
 * Returns the value stored under `localStorage.urls`, parsed as JSON.
 * Falls back to an empty array when nothing (or an empty string) is stored.
 */
const getUrls = () => {
  const serialized = localStorage.urls || "[]";
  return JSON.parse(serialized);
};
function nextPage() {
const btn = document.querySelector(
'button[type="button"].pointer svg path[d="M11.7364 8.29314C11.7643 8.22117 11.81 8.15746 11.8693 8.10803C11.9286 8.05859 11.9994 8.02505 12.0752 8.01054C12.151 7.99604 12.2293 8.00106 12.3026 8.02513C12.376 8.0492 12.442 8.09153 12.4944 8.14814L19.7834 15.4161C19.923 15.5769 19.9999 15.7827 19.9999 15.9956C19.9999 16.2086 19.923 16.4143 19.7834 16.5751L12.4944 23.8551C12.4527 23.8998 12.4024 23.9356 12.3466 23.9605C12.2908 23.9854 12.2305 23.9989 12.1694 24.0001C12.0778 23.9955 11.9891 23.9661 11.9128 23.915C11.8365 23.864 11.7756 23.7931 11.7364 23.7101C11.6511 23.5504 11.616 23.3687 11.6356 23.1887C11.6552 23.0086 11.7287 22.8387 11.8464 22.7011L18.5574 16.0011L11.8464 9.29514C11.7279 9.15936 11.6538 8.99054 11.6342 8.8114C11.6145 8.63226 11.6502 8.45139 11.7364 8.29314Z"]'
)?.parentElement?.parentElement?.parentElement;
btn?.click();
return !!btn;
}
// UI dependencies are loaded at runtime through dynamic imports from esm.sh.
const { h, render } = await import("https://esm.sh/preact");
const { default: htm } = await import("https://esm.sh/htm");
const { useState } = await import("https://esm.sh/preact/hooks");
// Tagged-template function (html`...`) that builds elements through Preact's `h`.
const html = htm.bind(h);
/**
 * Control panel component with two buttons: copy the collected URLs, and
 * start/stop automatic paging.
 *
 * State holds the id of the running auto-page interval, or null when
 * auto-paging is off.
 */
function Dashboard() {
  const [autoNextPageInterval, setAutoNextPageInterval] = useState(null);

  /** Starts auto-paging when it is off; stops it when it is on. */
  function toggleAutoNextPageInterval() {
    if (autoNextPageInterval === null) {
      // URLs of the page we are leaving; used to detect the next page's arrival.
      let lastRun = getProductUrlsInPage();
      nextPage();
      const i = setInterval(() => {
        const current = getProductUrlsInPage();
        // The next page counts as loaded only once it shows products and none
        // of them belonged to the previous page. Requiring a non-empty list
        // keeps a still-loading (empty) page from looking like a new page,
        // which would trigger another click and skip pages.
        const pageChanged =
          current.length > 0 && !current.some((u) => lastRun.includes(u));
        if (!pageChanged) return;
        saveProductUrlsInPage();
        lastRun = current;
        if (!nextPage()) {
          // No next button: last page reached, stop auto-paging.
          clearInterval(i);
          setAutoNextPageInterval(null);
        }
      }, 100);
      setAutoNextPageInterval(i);
    } else {
      clearInterval(autoNextPageInterval);
      setAutoNextPageInterval(null);
    }
  }

  return html`<div>
    <button onClick=${copyUrls}>copy urls</button>
    <button onClick=${toggleAutoNextPageInterval}>
      ${autoNextPageInterval === null ? "empezar autopage" : "parar autopage"}
    </button>
  </div>`;
}
// Container for the dashboard, pinned to the bottom-right corner of the viewport.
const dashboardEl = document.createElement("div");
// Set the inline styles through `cssText`, the typed way to assign a whole
// style string, so no @ts-ignore is needed.
dashboardEl.style.cssText = `
position: fixed;
bottom: 0; right: 0;
`;
// Render the component into the container, then attach it to the page.
render(html`<${Dashboard} />`, dashboardEl);
document.body.appendChild(dashboardEl);
function copyUrls() {
navigator.clipboard.writeText(getUrls().join("\n"));
}
function getProductUrlsInPage() {
return [...document.querySelectorAll(".vtex-product-summary-2-x-clearLink")]
.map((el) => el instanceof HTMLAnchorElement && el.href)
.filter(/** @returns {x is string} */ (x) => !!x);
}
function saveProductUrlsInPage() {
const urlSet = new Set(getUrls());
getProductUrlsInPage().forEach((url) => urlSet.add(url));
localStorage.urls = JSON.stringify([...urlSet]);
}
// Independently of auto-paging, store the product URLs currently on the page every 500 ms.
const i = setInterval(() => {
saveProductUrlsInPage();
}, 500);
})();