Compare commits

..

No commits in common. "925175ba9d85df7e6718d046d514917314ce34f8" and "8ccd69de1bb46ec014137b623b7f2458b581134a" have entirely different histories.

24 changed files with 402 additions and 355 deletions

BIN
bun.lockb

Binary file not shown.

View file

@ -1,2 +0,0 @@
ALTER TABLE precios ADD `name` text;--> statement-breakpoint
ALTER TABLE precios ADD `image_url` text;

View file

@ -1,93 +0,0 @@
{
"version": "5",
"dialect": "sqlite",
"id": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
"prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -15,13 +15,6 @@
"when": 1703374278842,
"tag": "0001_spotty_red_hulk",
"breakpoints": true
},
{
"idx": 2,
"version": "5",
"when": 1703452301821,
"tag": "0002_wild_amazoness",
"breakpoints": true
}
]
}

View file

@ -1,11 +1,11 @@
import Database from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import { migrate } from "drizzle-orm/better-sqlite3/migrator";
import Database from "better-sqlite3";
import { drizzle } from "drizzle-orm/better-sqlite3";
import * as schema from "./schema.js";
const sqlite = new Database("../scraper/sqlite.db");
const db = drizzle(sqlite, { schema });
migrate(db, { migrationsFolder: "./drizzle" });
await migrate(db, { migrationsFolder: "./drizzle" });
sqlite.close();

View file

@ -5,8 +5,7 @@
"description": "",
"main": "index.js",
"scripts": {
"migrate": "bun migrate.ts",
"generate": "drizzle-kit generate:sqlite"
"migrate": "node --import tsx/esm migrate.ts"
},
"keywords": [],
"author": "",

View file

@ -9,8 +9,6 @@ export const precios = sqliteTable("precios", {
url: text("url").notNull(),
warcRecordId: text("warc_record_id"),
parserVersion: integer("parser_version"),
name: text("name"),
imageUrl: text("image_url"),
});
export type Precio = typeof precios.$inferSelect;

View file

@ -1,11 +0,0 @@
{
"name": "preciazo",
"private": true,
"workspaces": [
"dia-link-scraper",
"coto-link-scraper",
"scraper",
"sitio",
"db-datos"
]
}

File diff suppressed because it is too large Load diff

6
pnpm-workspace.yaml Normal file
View file

@ -0,0 +1,6 @@
packages:
- "dia-link-scraper"
- "coto-link-scraper"
- "scraper"
- "sitio"
- "db-datos"

40
scraper/bench.ts Normal file
View file

@ -0,0 +1,40 @@
// import { run, bench, group, baseline } from "mitata";
import { createReadStream } from "node:fs";
import { Writable } from "node:stream";
import { pipeline } from "node:stream/promises";
import { getCarrefourProduct } from "./carrefour.js";
import { WARCParser } from "warcio";
// import { ZSTDDecompress } from "simple-zstd";
// import { AutoWARCParser } from "node-warc";
// const html = await readFile("carrefour.html", "utf-8");
// bench("carrefour", () => {
// getCarrefourProduct(html);
// });
// await bench("warcio", async () => {
// const warc = Bun.spawn(
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
// {
// // stdin: Bun.file().stream(),
// }
// ).stdout;
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
// const parser = new WARCParser(warc);
// for await (const record of parser) {
// const html = await record.contentText();
// }
// });
// await bench("warc", );
/**
 * Minimal stand-in for a benchmarking library: runs `func` exactly once,
 * waits for it to settle, and prints the wall-clock duration.
 *
 * @param name label printed in front of the measured duration
 * @param func async workload to time; if it rejects, the rejection propagates
 *             and nothing is logged
 */
async function bench(name: string, func: () => Promise<void>) {
  const startedAt = performance.now();
  await func();
  const finishedAt = performance.now();
  // performance.now() values are in milliseconds.
  const elapsed = finishedAt - startedAt;
  console.debug(`${name} took ${elapsed}`);
}
// await run({});

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { Precioish, type Precio } from "../scrap.js";
import { getProductJsonLd, priceFromMeta } from "../common.js";
import { Precioish, type Precio } from "./scrap.js";
import { getProductJsonLd, priceFromMeta } from "./common.js";
function getEanByTable(dom: Window): string {
const eanLabelEl = dom.window.document.querySelector(
@ -55,17 +55,14 @@ export function getCarrefourProduct(html: string | Buffer): Precioish {
const precioCentavos = priceFromMeta(dom);
// const productLd = findJsonLd(dom, "Product");
const ean = eanFromSeedState(dom);
const ld = getProductJsonLd(dom);
const name = ld.name;
const imageUrl = ld.image;
const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock";
return {
name,
imageUrl,
ean,
precioCentavos,
inStock,

View file

@ -1,5 +1,5 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { type Precioish } from "./scrap.js";
function getEanFromText({ document }: Window) {
const potentialEanEls = Array.from(
@ -34,10 +34,5 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom);
const name = dom.document.querySelector("h1.product_page")?.textContent;
const imageUrl = dom.document.querySelector<HTMLImageElement>(
".productImageZoom img"
)?.src;
return { name, imageUrl, ean, precioCentavos };
return { ean, precioCentavos };
}

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom";
import { type Precioish } from "../scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
import { type Precioish } from "./scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html);
@ -10,14 +10,10 @@ export function getDiaProduct(html: string | Buffer): Precioish {
const precioCentavos = priceFromMeta(dom);
const ld = getProductJsonLd(dom);
const name = ld.name;
const imageUrl = ld.image;
const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock";
return {
name,
imageUrl,
ean,
precioCentavos,
inStock,

27
scraper/downloadUrls.ts Normal file
View file

@ -0,0 +1,27 @@
import { readFile, writeFile } from "fs/promises";
import pMap from "p-map";
import { nanoid } from "nanoid";
import { getHtml } from "./fetch.js";
import { join } from "path";
// Script entry point: fetches every URL listed in a text file and stores the
// results in a directory.
//
// Usage: <script> <inputPath> <outputPath>
//   inputPath  - text file with one URL per line
//   outputPath - directory that receives two files per URL, both named after a
//                freshly generated nanoid: `<id>.link` holds the URL and `<id>`
//                holds whatever getHtml(url) resolved to. The directory is not
//                created here.
//
// Exits with status 1 when either argument is missing.
(async () => {
  const inputPath = process.argv[2];
  const outputPath = process.argv[3];
  if (!inputPath || !outputPath) {
    console.error("falta input y/o output");
    process.exit(1);
  }
  const file = await readFile(inputPath, "utf-8");
  // A trailing newline (or any blank line) would otherwise produce an empty
  // "URL" that gets fetched and written out; CRLF files would leave a stray
  // "\r" on every URL. Normalise and drop those entries.
  const urls = file
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  await pMap(
    urls,
    async (url: string) => {
      const id = nanoid();
      const html = await getHtml(url);
      // Write the link file and the body under the same id so they can be
      // paired up again later.
      await writeFile(join(outputPath, `${id}.link`), url);
      await writeFile(join(outputPath, id), html);
    },
    // At most 12 URLs are processed at the same time.
    { concurrency: 12 }
  );
})();

View file

@ -1,18 +1,20 @@
/// <reference lib="dom" />
/// <reference lib="dom.iterable" />
import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio";
import { writeFile } from "fs/promises";
import { createHash } from "crypto";
import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./parsers/coto.js";
import { getCarrefourProduct } from "./carrefour.js";
import { getDiaProduct } from "./dia.js";
import { getCotoProduct } from "./coto.js";
import { join } from "path";
import pMap from "p-map";
import { and, eq, sql } from "drizzle-orm";
const DEBUG = false;
const PARSER_VERSION = 2;
const PARSER_VERSION = 1;
const sqlite = new Database("sqlite.db");
const db = drizzle(sqlite, { schema });

View file

@ -1,10 +0,0 @@
FROM docker.io/oven/bun:1-alpine
COPY build/ .
RUN bun i
EXPOSE 3000
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
ENV PROTOCOL_HEADER=x-forwarded-proto
ENV HOST_HEADER=x-forwarded-host
CMD ["bun", "run", "start"]

View file

@ -12,9 +12,10 @@
"format": "prettier --write ."
},
"devDependencies": {
"@sveltejs/adapter-node": "^2.0.2",
"@sveltejs/kit": "^2.0.0",
"@sveltejs/vite-plugin-svelte": "^3.0.0",
"@types/bun": "^1.0.0",
"@types/better-sqlite3": "^7.6.8",
"autoprefixer": "^10.4.16",
"db-datos": "workspace:^",
"postcss": "^8.4.32",
@ -23,7 +24,6 @@
"prettier-plugin-svelte": "^3.1.2",
"prettier-plugin-tailwindcss": "^0.5.9",
"svelte": "^4.2.7",
"svelte-adapter-bun": "^0.5.1",
"svelte-check": "^3.6.0",
"tailwindcss": "^3.3.6",
"tslib": "^2.4.1",
@ -32,6 +32,7 @@
},
"type": "module",
"dependencies": {
"better-sqlite3": "^9.2.2",
"chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10",

View file

@ -1,5 +1,5 @@
import Database from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite";
import Database from "better-sqlite3";
import { drizzle } from "drizzle-orm/better-sqlite3";
import * as schema from "db-datos/schema.js";
import { env } from "$env/dynamic/private";

View file

@ -1,17 +1,18 @@
import { error } from "@sveltejs/kit";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
import { sql } from "drizzle-orm";
import { ilike, like, sql } from "drizzle-orm";
export const load: PageServerLoad = async ({ params }) => {
const q = db
.select({ ean: precios.ean, name: precios.name })
.from(precios)
.groupBy(precios.ean)
.having(sql`max(length(name))`)
.select({ ean: schema.precios.ean })
.from(schema.precios)
.where(
like(schema.precios.url, `https://diaonline.supermercadosdia.com.ar%`),
)
.groupBy(schema.precios.ean)
.orderBy(sql`random()`)
.limit(150);
const res = await q;
return { precios: res };
const precios = await q;
return { precios };
};

View file

@ -10,7 +10,7 @@
{#each data.precios as product}
<li>
<a href={`/ean/${product.ean}`}>
{product.name}
{product.ean}
</a>
</li>
{/each}

View file

@ -1,20 +1,13 @@
import { error } from "@sveltejs/kit";
import { eq, max } from "drizzle-orm";
import { eq } from "drizzle-orm";
import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db";
const { precios } = schema;
export const load: PageServerLoad = async ({ params }) => {
const q = db
.select()
.from(precios)
.where(eq(precios.ean, params.ean))
.groupBy(precios.warcRecordId)
.having(max(precios.parserVersion));
const res = await q;
if (res.length === 0) return error(404, "Not Found");
const precios = await db.query.precios.findMany({
where: eq(schema.precios.ean, params.ean),
});
if (precios.length === 0) return error(404, "Not Found");
const meta = res.find((p) => p.name);
return { precios: res, meta };
return { precios };
};

View file

@ -5,11 +5,6 @@
export let data: PageData;
</script>
{#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.imageUrl} class="max-h-48" />
{/if}
<ul>
{#each data.precios as precio}
<li>

View file

@ -1,5 +1,5 @@
// import adapter from "@sveltejs/adapter-node";
import adapter from "svelte-adapter-bun";
import adapter from "@sveltejs/adapter-node";
// import adapter from "svelte-adapter-bun";
import { vitePreprocess } from "@sveltejs/vite-plugin-svelte";
/** @type {import('@sveltejs/kit').Config} */