Compare commits

..

No commits in common. "925175ba9d85df7e6718d046d514917314ce34f8" and "8ccd69de1bb46ec014137b623b7f2458b581134a" have entirely different histories.

24 changed files with 402 additions and 355 deletions

BIN
bun.lockb

Binary file not shown.

View file

@ -1,2 +0,0 @@
ALTER TABLE precios ADD `name` text;--> statement-breakpoint
ALTER TABLE precios ADD `image_url` text;

View file

@ -1,93 +0,0 @@
{
"version": "5",
"dialect": "sqlite",
"id": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
"prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -15,13 +15,6 @@
"when": 1703374278842, "when": 1703374278842,
"tag": "0001_spotty_red_hulk", "tag": "0001_spotty_red_hulk",
"breakpoints": true "breakpoints": true
},
{
"idx": 2,
"version": "5",
"when": 1703452301821,
"tag": "0002_wild_amazoness",
"breakpoints": true
} }
] ]
} }

View file

@ -1,11 +1,11 @@
import Database from "bun:sqlite"; import { migrate } from "drizzle-orm/better-sqlite3/migrator";
import { drizzle } from "drizzle-orm/bun-sqlite"; import Database from "better-sqlite3";
import { migrate } from "drizzle-orm/bun-sqlite/migrator"; import { drizzle } from "drizzle-orm/better-sqlite3";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
const sqlite = new Database("../scraper/sqlite.db"); const sqlite = new Database("../scraper/sqlite.db");
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });
migrate(db, { migrationsFolder: "./drizzle" }); await migrate(db, { migrationsFolder: "./drizzle" });
sqlite.close(); sqlite.close();

View file

@ -5,8 +5,7 @@
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"migrate": "bun migrate.ts", "migrate": "node --import tsx/esm migrate.ts"
"generate": "drizzle-kit generate:sqlite"
}, },
"keywords": [], "keywords": [],
"author": "", "author": "",

View file

@ -9,8 +9,6 @@ export const precios = sqliteTable("precios", {
url: text("url").notNull(), url: text("url").notNull(),
warcRecordId: text("warc_record_id"), warcRecordId: text("warc_record_id"),
parserVersion: integer("parser_version"), parserVersion: integer("parser_version"),
name: text("name"),
imageUrl: text("image_url"),
}); });
export type Precio = typeof precios.$inferSelect; export type Precio = typeof precios.$inferSelect;

View file

@ -1,11 +0,0 @@
{
"name": "preciazo",
"private": true,
"workspaces": [
"dia-link-scraper",
"coto-link-scraper",
"scraper",
"sitio",
"db-datos"
]
}

File diff suppressed because it is too large Load diff

6
pnpm-workspace.yaml Normal file
View file

@ -0,0 +1,6 @@
packages:
- "dia-link-scraper"
- "coto-link-scraper"
- "scraper"
- "sitio"
- "db-datos"

40
scraper/bench.ts Normal file
View file

@ -0,0 +1,40 @@
// import { run, bench, group, baseline } from "mitata";
import { createReadStream } from "node:fs";
import { Writable } from "node:stream";
import { pipeline } from "node:stream/promises";
import { getCarrefourProduct } from "./carrefour.js";
import { WARCParser } from "warcio";
// import { ZSTDDecompress } from "simple-zstd";
// import { AutoWARCParser } from "node-warc";
// const html = await readFile("carrefour.html", "utf-8");
// bench("carrefour", () => {
// getCarrefourProduct(html);
// });
// await bench("warcio", async () => {
// const warc = Bun.spawn(
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
// {
// // stdin: Bun.file().stream(),
// }
// ).stdout;
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
// const parser = new WARCParser(warc);
// for await (const record of parser) {
// const html = await record.contentText();
// }
// });
// await bench("warc", );
/**
 * Runs `func` a single time and reports how long it took.
 *
 * The elapsed time is the difference between two `performance.now()` readings
 * taken immediately before calling `func` and immediately after its promise
 * settles; the result is written with `console.debug` as `<name> took <elapsed>`.
 * If `func` rejects, the rejection propagates and nothing is logged.
 *
 * @param name Label used as the prefix of the logged line.
 * @param func Async workload to time.
 */
async function bench(name: string, func: () => Promise<void>) {
  const startedAt = performance.now();
  await func();
  const elapsed = performance.now() - startedAt;
  console.debug(`${name} took ${elapsed}`);
}
// await run({});

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { Precioish, type Precio } from "../scrap.js"; import { Precioish, type Precio } from "./scrap.js";
import { getProductJsonLd, priceFromMeta } from "../common.js"; import { getProductJsonLd, priceFromMeta } from "./common.js";
function getEanByTable(dom: Window): string { function getEanByTable(dom: Window): string {
const eanLabelEl = dom.window.document.querySelector( const eanLabelEl = dom.window.document.querySelector(
@ -55,17 +55,14 @@ export function getCarrefourProduct(html: string | Buffer): Precioish {
const precioCentavos = priceFromMeta(dom); const precioCentavos = priceFromMeta(dom);
// const productLd = findJsonLd(dom, "Product");
const ean = eanFromSeedState(dom); const ean = eanFromSeedState(dom);
const ld = getProductJsonLd(dom); const ld = getProductJsonLd(dom);
const name = ld.name;
const imageUrl = ld.image;
const inStock = const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock"; ld.offers.offers[0].availability === "http://schema.org/InStock";
return { return {
name,
imageUrl,
ean, ean,
precioCentavos, precioCentavos,
inStock, inStock,

View file

@ -1,5 +1,5 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js"; import { type Precioish } from "./scrap.js";
function getEanFromText({ document }: Window) { function getEanFromText({ document }: Window) {
const potentialEanEls = Array.from( const potentialEanEls = Array.from(
@ -34,10 +34,5 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const ean = getEanFromText(dom); const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom); const precioCentavos = getPriceFromText(dom);
const name = dom.document.querySelector("h1.product_page")?.textContent; return { ean, precioCentavos };
const imageUrl = dom.document.querySelector<HTMLImageElement>(
".productImageZoom img"
)?.src;
return { name, imageUrl, ean, precioCentavos };
} }

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js"; import { type Precioish } from "./scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js"; import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
export function getDiaProduct(html: string | Buffer): Precioish { export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html); const dom = parseHTML(html);
@ -10,14 +10,10 @@ export function getDiaProduct(html: string | Buffer): Precioish {
const precioCentavos = priceFromMeta(dom); const precioCentavos = priceFromMeta(dom);
const ld = getProductJsonLd(dom); const ld = getProductJsonLd(dom);
const name = ld.name;
const imageUrl = ld.image;
const inStock = const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock"; ld.offers.offers[0].availability === "http://schema.org/InStock";
return { return {
name,
imageUrl,
ean, ean,
precioCentavos, precioCentavos,
inStock, inStock,

27
scraper/downloadUrls.ts Normal file
View file

@ -0,0 +1,27 @@
import { readFile, writeFile } from "fs/promises";
import pMap from "p-map";
import { nanoid } from "nanoid";
import { getHtml } from "./fetch.js";
import { join } from "path";
// Script entry point: downloads every URL listed in a text file.
//
// Usage: <script> <inputPath> <outputPath>
//   inputPath  - text file with one URL per line.
//   outputPath - directory that receives the results; this script does not
//                create it. For each URL a fresh nanoid `id` is generated and
//                two files are written: `<id>.link` (the URL itself) and
//                `<id>` (whatever getHtml returned for that URL).
//
// Exits with status 1 when either argument is missing.
(async () => {
  const inputPath = process.argv[2];
  const outputPath = process.argv[3];
  if (!inputPath || !outputPath) {
    console.error("falta input y/o output");
    process.exit(1);
  }

  const file = await readFile(inputPath, "utf-8");
  // Accept both LF and CRLF line endings and ignore blank lines (including
  // the empty entry produced by a trailing newline); otherwise an empty or
  // "\r"-terminated string would be handed to getHtml as if it were a URL.
  const urls = file
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  await pMap(
    urls,
    async (url: string) => {
      const id = nanoid();
      const html = await getHtml(url);
      // The .link file records which URL the sibling `<id>` file came from.
      await writeFile(join(outputPath, `${id}.link`), url);
      await writeFile(join(outputPath, id), html);
    },
    // At most 12 downloads in flight at once.
    { concurrency: 12 }
  );
})();

View file

@ -1,18 +1,20 @@
/// <reference lib="dom" />
/// <reference lib="dom.iterable" />
import { Database } from "bun:sqlite"; import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite"; import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio"; import { WARCParser } from "warcio";
import { writeFile } from "fs/promises"; import { writeFile } from "fs/promises";
import { createHash } from "crypto"; import { createHash } from "crypto";
import { getCarrefourProduct } from "./parsers/carrefour.js"; import { getCarrefourProduct } from "./carrefour.js";
import { getDiaProduct } from "./parsers/dia.js"; import { getDiaProduct } from "./dia.js";
import { getCotoProduct } from "./parsers/coto.js"; import { getCotoProduct } from "./coto.js";
import { join } from "path"; import { join } from "path";
import pMap from "p-map"; import pMap from "p-map";
import { and, eq, sql } from "drizzle-orm"; import { and, eq, sql } from "drizzle-orm";
const DEBUG = false; const DEBUG = false;
const PARSER_VERSION = 2; const PARSER_VERSION = 1;
const sqlite = new Database("sqlite.db"); const sqlite = new Database("sqlite.db");
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });

View file

@ -1,10 +0,0 @@
FROM docker.io/oven/bun:1-alpine
COPY build/ .
RUN bun i
EXPOSE 3000
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
ENV PROTOCOL_HEADER=x-forwarded-proto
ENV HOST_HEADER=x-forwarded-host
CMD ["bun", "run", "start"]

View file

@ -12,9 +12,10 @@
"format": "prettier --write ." "format": "prettier --write ."
}, },
"devDependencies": { "devDependencies": {
"@sveltejs/adapter-node": "^2.0.2",
"@sveltejs/kit": "^2.0.0", "@sveltejs/kit": "^2.0.0",
"@sveltejs/vite-plugin-svelte": "^3.0.0", "@sveltejs/vite-plugin-svelte": "^3.0.0",
"@types/bun": "^1.0.0", "@types/better-sqlite3": "^7.6.8",
"autoprefixer": "^10.4.16", "autoprefixer": "^10.4.16",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"postcss": "^8.4.32", "postcss": "^8.4.32",
@ -23,7 +24,6 @@
"prettier-plugin-svelte": "^3.1.2", "prettier-plugin-svelte": "^3.1.2",
"prettier-plugin-tailwindcss": "^0.5.9", "prettier-plugin-tailwindcss": "^0.5.9",
"svelte": "^4.2.7", "svelte": "^4.2.7",
"svelte-adapter-bun": "^0.5.1",
"svelte-check": "^3.6.0", "svelte-check": "^3.6.0",
"tailwindcss": "^3.3.6", "tailwindcss": "^3.3.6",
"tslib": "^2.4.1", "tslib": "^2.4.1",
@ -32,6 +32,7 @@
}, },
"type": "module", "type": "module",
"dependencies": { "dependencies": {
"better-sqlite3": "^9.2.2",
"chart.js": "^4.4.1", "chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4", "chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10", "dayjs": "^1.11.10",

View file

@ -1,5 +1,5 @@
import Database from "bun:sqlite"; import Database from "better-sqlite3";
import { drizzle } from "drizzle-orm/bun-sqlite"; import { drizzle } from "drizzle-orm/better-sqlite3";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { env } from "$env/dynamic/private"; import { env } from "$env/dynamic/private";

View file

@ -1,17 +1,18 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db, schema } from "$lib/server/db";
const { precios } = schema; import { ilike, like, sql } from "drizzle-orm";
import { sql } from "drizzle-orm";
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const q = db const q = db
.select({ ean: precios.ean, name: precios.name }) .select({ ean: schema.precios.ean })
.from(precios) .from(schema.precios)
.groupBy(precios.ean) .where(
.having(sql`max(length(name))`) like(schema.precios.url, `https://diaonline.supermercadosdia.com.ar%`),
)
.groupBy(schema.precios.ean)
.orderBy(sql`random()`) .orderBy(sql`random()`)
.limit(150); .limit(150);
const res = await q; const precios = await q;
return { precios: res }; return { precios };
}; };

View file

@ -10,7 +10,7 @@
{#each data.precios as product} {#each data.precios as product}
<li> <li>
<a href={`/ean/${product.ean}`}> <a href={`/ean/${product.ean}`}>
{product.name} {product.ean}
</a> </a>
</li> </li>
{/each} {/each}

View file

@ -1,20 +1,13 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import { eq, max } from "drizzle-orm"; import { eq } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db, schema } from "$lib/server/db";
const { precios } = schema;
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const q = db const precios = await db.query.precios.findMany({
.select() where: eq(schema.precios.ean, params.ean),
.from(precios) });
.where(eq(precios.ean, params.ean)) if (precios.length === 0) return error(404, "Not Found");
.groupBy(precios.warcRecordId)
.having(max(precios.parserVersion));
const res = await q;
if (res.length === 0) return error(404, "Not Found");
const meta = res.find((p) => p.name); return { precios };
return { precios: res, meta };
}; };

View file

@ -5,11 +5,6 @@
export let data: PageData; export let data: PageData;
</script> </script>
{#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.imageUrl} class="max-h-48" />
{/if}
<ul> <ul>
{#each data.precios as precio} {#each data.precios as precio}
<li> <li>

View file

@ -1,5 +1,5 @@
// import adapter from "@sveltejs/adapter-node"; import adapter from "@sveltejs/adapter-node";
import adapter from "svelte-adapter-bun"; // import adapter from "svelte-adapter-bun";
import { vitePreprocess } from "@sveltejs/vite-plugin-svelte"; import { vitePreprocess } from "@sveltejs/vite-plugin-svelte";
/** @type {import('@sveltejs/kit').Config} */ /** @type {import('@sveltejs/kit').Config} */