Compare commits

..

6 commits

Author SHA1 Message Date
925175ba9d parsear name y imageUrl 2023-12-24 19:21:51 -03:00
08f62c78db usar bun para db-datos 2023-12-24 18:12:06 -03:00
794247d657 limpiar scraper 2023-12-24 18:06:52 -03:00
f2f5c7afdd usar bun install + sitio container 2023-12-24 17:57:45 -03:00
f5cc28f44b sitio: compilar a bun 2023-12-24 13:42:45 -03:00
3a6fe55da4 volver a bun para sitio 2023-12-24 13:31:00 -03:00
24 changed files with 355 additions and 402 deletions

BIN
bun.lockb Executable file

Binary file not shown.

View file

@ -0,0 +1,2 @@
ALTER TABLE precios ADD `name` text;--> statement-breakpoint
ALTER TABLE precios ADD `image_url` text;

View file

@ -0,0 +1,93 @@
{
"version": "5",
"dialect": "sqlite",
"id": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
"prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@ -15,6 +15,13 @@
"when": 1703374278842, "when": 1703374278842,
"tag": "0001_spotty_red_hulk", "tag": "0001_spotty_red_hulk",
"breakpoints": true "breakpoints": true
},
{
"idx": 2,
"version": "5",
"when": 1703452301821,
"tag": "0002_wild_amazoness",
"breakpoints": true
} }
] ]
} }

View file

@ -1,11 +1,11 @@
import { migrate } from "drizzle-orm/better-sqlite3/migrator"; import Database from "bun:sqlite";
import Database from "better-sqlite3"; import { drizzle } from "drizzle-orm/bun-sqlite";
import { drizzle } from "drizzle-orm/better-sqlite3"; import { migrate } from "drizzle-orm/bun-sqlite/migrator";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
const sqlite = new Database("../scraper/sqlite.db"); const sqlite = new Database("../scraper/sqlite.db");
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });
await migrate(db, { migrationsFolder: "./drizzle" }); migrate(db, { migrationsFolder: "./drizzle" });
sqlite.close(); sqlite.close();

View file

@ -5,7 +5,8 @@
"description": "", "description": "",
"main": "index.js", "main": "index.js",
"scripts": { "scripts": {
"migrate": "node --import tsx/esm migrate.ts" "migrate": "bun migrate.ts",
"generate": "drizzle-kit generate:sqlite"
}, },
"keywords": [], "keywords": [],
"author": "", "author": "",

View file

@ -9,6 +9,8 @@ export const precios = sqliteTable("precios", {
url: text("url").notNull(), url: text("url").notNull(),
warcRecordId: text("warc_record_id"), warcRecordId: text("warc_record_id"),
parserVersion: integer("parser_version"), parserVersion: integer("parser_version"),
name: text("name"),
imageUrl: text("image_url"),
}); });
export type Precio = typeof precios.$inferSelect; export type Precio = typeof precios.$inferSelect;

11
package.json Normal file
View file

@ -0,0 +1,11 @@
{
"name": "preciazo",
"private": true,
"workspaces": [
"dia-link-scraper",
"coto-link-scraper",
"scraper",
"sitio",
"db-datos"
]
}

File diff suppressed because it is too large Load diff

View file

@ -1,6 +0,0 @@
packages:
- "dia-link-scraper"
- "coto-link-scraper"
- "scraper"
- "sitio"
- "db-datos"

View file

@ -1,40 +0,0 @@
// import { run, bench, group, baseline } from "mitata";
import { createReadStream } from "node:fs";
import { Writable } from "node:stream";
import { pipeline } from "node:stream/promises";
import { getCarrefourProduct } from "./carrefour.js";
import { WARCParser } from "warcio";
// import { ZSTDDecompress } from "simple-zstd";
// import { AutoWARCParser } from "node-warc";
// const html = await readFile("carrefour.html", "utf-8");
// bench("carrefour", () => {
// getCarrefourProduct(html);
// });
// await bench("warcio", async () => {
// const warc = Bun.spawn(
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
// {
// // stdin: Bun.file().stream(),
// }
// ).stdout;
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
// const parser = new WARCParser(warc);
// for await (const record of parser) {
// const html = await record.contentText();
// }
// });
// await bench("warc", );
/**
 * Runs an async task once and logs how long it took.
 *
 * The elapsed time is the difference between two `performance.now()`
 * readings taken immediately before and after awaiting `func`.
 *
 * @param name - Label placed at the start of the logged line.
 * @param func - Async task to time; it is invoked exactly once.
 */
async function bench(name: string, func: () => Promise<void>) {
  const startedAt = performance.now();
  await func();
  const elapsed = performance.now() - startedAt;
  console.debug(`${name} took ${elapsed}`);
}
// await run({});

View file

@ -1,27 +0,0 @@
import { readFile, writeFile } from "fs/promises";
import pMap from "p-map";
import { nanoid } from "nanoid";
import { getHtml } from "./fetch.js";
import { join } from "path";
/**
 * CLI entry point: downloads every URL listed in an input file.
 *
 * `process.argv[2]` is the path of a text file with one URL per line and
 * `process.argv[3]` is the output directory. For each URL a fresh id is
 * generated and two files are written into the output directory:
 * `<id>.link` containing the URL, and `<id>` containing the fetched HTML.
 * Exits with status 1 when either argument is missing.
 */
(async () => {
  const inputPath = process.argv[2];
  const outputPath = process.argv[3];
  if (!inputPath || !outputPath) {
    console.error("falta input y/o output");
    process.exit(1);
  }
  const file = await readFile(inputPath, "utf-8");
  // Trim each line and drop blank ones: a trailing newline (or a CRLF line
  // ending) would otherwise produce an empty or malformed URL to fetch.
  const urls = file
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  await pMap(
    urls,
    async (url: string) => {
      const id = nanoid();
      const html = await getHtml(url);
      await writeFile(join(outputPath, `${id}.link`), url);
      await writeFile(join(outputPath, id), html);
    },
    // At most 12 URLs are processed at the same time.
    { concurrency: 12 }
  );
})();

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { Precioish, type Precio } from "./scrap.js"; import { Precioish, type Precio } from "../scrap.js";
import { getProductJsonLd, priceFromMeta } from "./common.js"; import { getProductJsonLd, priceFromMeta } from "../common.js";
function getEanByTable(dom: Window): string { function getEanByTable(dom: Window): string {
const eanLabelEl = dom.window.document.querySelector( const eanLabelEl = dom.window.document.querySelector(
@ -55,14 +55,17 @@ export function getCarrefourProduct(html: string | Buffer): Precioish {
const precioCentavos = priceFromMeta(dom); const precioCentavos = priceFromMeta(dom);
// const productLd = findJsonLd(dom, "Product");
const ean = eanFromSeedState(dom); const ean = eanFromSeedState(dom);
const ld = getProductJsonLd(dom); const ld = getProductJsonLd(dom);
const name = ld.name;
const imageUrl = ld.image;
const inStock = const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock"; ld.offers.offers[0].availability === "http://schema.org/InStock";
return { return {
name,
imageUrl,
ean, ean,
precioCentavos, precioCentavos,
inStock, inStock,

View file

@ -1,5 +1,5 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { type Precioish } from "./scrap.js"; import { type Precioish } from "../scrap.js";
function getEanFromText({ document }: Window) { function getEanFromText({ document }: Window) {
const potentialEanEls = Array.from( const potentialEanEls = Array.from(
@ -34,5 +34,10 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const ean = getEanFromText(dom); const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom); const precioCentavos = getPriceFromText(dom);
return { ean, precioCentavos }; const name = dom.document.querySelector("h1.product_page")?.textContent;
const imageUrl = dom.document.querySelector<HTMLImageElement>(
".productImageZoom img"
)?.src;
return { name, imageUrl, ean, precioCentavos };
} }

View file

@ -1,6 +1,6 @@
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { type Precioish } from "./scrap.js"; import { type Precioish } from "../scrap.js";
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js"; import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
export function getDiaProduct(html: string | Buffer): Precioish { export function getDiaProduct(html: string | Buffer): Precioish {
const dom = parseHTML(html); const dom = parseHTML(html);
@ -10,10 +10,14 @@ export function getDiaProduct(html: string | Buffer): Precioish {
const precioCentavos = priceFromMeta(dom); const precioCentavos = priceFromMeta(dom);
const ld = getProductJsonLd(dom); const ld = getProductJsonLd(dom);
const name = ld.name;
const imageUrl = ld.image;
const inStock = const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock"; ld.offers.offers[0].availability === "http://schema.org/InStock";
return { return {
name,
imageUrl,
ean, ean,
precioCentavos, precioCentavos,
inStock, inStock,

View file

@ -1,20 +1,18 @@
/// <reference lib="dom" />
/// <reference lib="dom.iterable" />
import { Database } from "bun:sqlite"; import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite"; import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { WARCParser } from "warcio"; import { WARCParser } from "warcio";
import { writeFile } from "fs/promises"; import { writeFile } from "fs/promises";
import { createHash } from "crypto"; import { createHash } from "crypto";
import { getCarrefourProduct } from "./carrefour.js"; import { getCarrefourProduct } from "./parsers/carrefour.js";
import { getDiaProduct } from "./dia.js"; import { getDiaProduct } from "./parsers/dia.js";
import { getCotoProduct } from "./coto.js"; import { getCotoProduct } from "./parsers/coto.js";
import { join } from "path"; import { join } from "path";
import pMap from "p-map"; import pMap from "p-map";
import { and, eq, sql } from "drizzle-orm"; import { and, eq, sql } from "drizzle-orm";
const DEBUG = false; const DEBUG = false;
const PARSER_VERSION = 1; const PARSER_VERSION = 2;
const sqlite = new Database("sqlite.db"); const sqlite = new Database("sqlite.db");
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });

10
sitio/Containerfile Normal file
View file

@ -0,0 +1,10 @@
# Image that runs the contents of build/ with Bun.
FROM docker.io/oven/bun:1-alpine

# Copy the build output into the image's working directory and install
# its dependencies there.
COPY build/ .
RUN bun install

# The container is expected to listen on port 3000.
EXPOSE 3000

# https://github.com/gornostay25/svelte-adapter-bun/issues/39
# NOTE(review): presumably needed so the original protocol/host are taken
# from the forwarded headers when running behind a proxy; confirm against
# the linked issue.
ENV PROTOCOL_HEADER=x-forwarded-proto \
    HOST_HEADER=x-forwarded-host

CMD ["bun", "run", "start"]

View file

@ -12,10 +12,9 @@
"format": "prettier --write ." "format": "prettier --write ."
}, },
"devDependencies": { "devDependencies": {
"@sveltejs/adapter-node": "^2.0.2",
"@sveltejs/kit": "^2.0.0", "@sveltejs/kit": "^2.0.0",
"@sveltejs/vite-plugin-svelte": "^3.0.0", "@sveltejs/vite-plugin-svelte": "^3.0.0",
"@types/better-sqlite3": "^7.6.8", "@types/bun": "^1.0.0",
"autoprefixer": "^10.4.16", "autoprefixer": "^10.4.16",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"postcss": "^8.4.32", "postcss": "^8.4.32",
@ -24,6 +23,7 @@
"prettier-plugin-svelte": "^3.1.2", "prettier-plugin-svelte": "^3.1.2",
"prettier-plugin-tailwindcss": "^0.5.9", "prettier-plugin-tailwindcss": "^0.5.9",
"svelte": "^4.2.7", "svelte": "^4.2.7",
"svelte-adapter-bun": "^0.5.1",
"svelte-check": "^3.6.0", "svelte-check": "^3.6.0",
"tailwindcss": "^3.3.6", "tailwindcss": "^3.3.6",
"tslib": "^2.4.1", "tslib": "^2.4.1",
@ -32,7 +32,6 @@
}, },
"type": "module", "type": "module",
"dependencies": { "dependencies": {
"better-sqlite3": "^9.2.2",
"chart.js": "^4.4.1", "chart.js": "^4.4.1",
"chartjs-adapter-dayjs-4": "^1.0.4", "chartjs-adapter-dayjs-4": "^1.0.4",
"dayjs": "^1.11.10", "dayjs": "^1.11.10",

View file

@ -1,5 +1,5 @@
import Database from "better-sqlite3"; import Database from "bun:sqlite";
import { drizzle } from "drizzle-orm/better-sqlite3"; import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
import { env } from "$env/dynamic/private"; import { env } from "$env/dynamic/private";

View file

@ -1,18 +1,17 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db, schema } from "$lib/server/db";
import { ilike, like, sql } from "drizzle-orm"; const { precios } = schema;
import { sql } from "drizzle-orm";
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const q = db const q = db
.select({ ean: schema.precios.ean }) .select({ ean: precios.ean, name: precios.name })
.from(schema.precios) .from(precios)
.where( .groupBy(precios.ean)
like(schema.precios.url, `https://diaonline.supermercadosdia.com.ar%`), .having(sql`max(length(name))`)
)
.groupBy(schema.precios.ean)
.orderBy(sql`random()`) .orderBy(sql`random()`)
.limit(150); .limit(150);
const precios = await q; const res = await q;
return { precios }; return { precios: res };
}; };

View file

@ -10,7 +10,7 @@
{#each data.precios as product} {#each data.precios as product}
<li> <li>
<a href={`/ean/${product.ean}`}> <a href={`/ean/${product.ean}`}>
{product.ean} {product.name}
</a> </a>
</li> </li>
{/each} {/each}

View file

@ -1,13 +1,20 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import { eq } from "drizzle-orm"; import { eq, max } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db, schema } from "$lib/server/db";
const { precios } = schema;
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const precios = await db.query.precios.findMany({ const q = db
where: eq(schema.precios.ean, params.ean), .select()
}); .from(precios)
if (precios.length === 0) return error(404, "Not Found"); .where(eq(precios.ean, params.ean))
.groupBy(precios.warcRecordId)
.having(max(precios.parserVersion));
const res = await q;
if (res.length === 0) return error(404, "Not Found");
return { precios }; const meta = res.find((p) => p.name);
return { precios: res, meta };
}; };

View file

@ -5,6 +5,11 @@
export let data: PageData; export let data: PageData;
</script> </script>
<!-- Product header: shown only when a row with a name was found. -->
{#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
<!-- Skip the image when no URL was parsed, so no broken <img> is rendered;
     alt text is required for accessibility. -->
{#if data.meta.imageUrl}
<img src={data.meta.imageUrl} alt={data.meta.name} class="max-h-48" />
{/if}
{/if}
<ul> <ul>
{#each data.precios as precio} {#each data.precios as precio}
<li> <li>

View file

@ -1,5 +1,5 @@
import adapter from "@sveltejs/adapter-node"; // import adapter from "@sveltejs/adapter-node";
// import adapter from "svelte-adapter-bun"; import adapter from "svelte-adapter-bun";
import { vitePreprocess } from "@sveltejs/vite-plugin-svelte"; import { vitePreprocess } from "@sveltejs/vite-plugin-svelte";
/** @type {import('@sveltejs/kit').Config} */ /** @type {import('@sveltejs/kit').Config} */