From 925175ba9d85df7e6718d046d514917314ce34f8 Mon Sep 17 00:00:00 2001 From: Nulo Date: Sun, 24 Dec 2023 19:21:51 -0300 Subject: [PATCH] parsear name y imageUrl --- db-datos/drizzle/0002_wild_amazoness.sql | 2 + db-datos/drizzle/meta/0002_snapshot.json | 93 ++++++++++++++++++++++ db-datos/drizzle/meta/_journal.json | 7 ++ db-datos/schema.ts | 2 + scraper/parsers/carrefour.ts | 5 +- scraper/parsers/coto.ts | 7 +- scraper/parsers/dia.ts | 4 + scraper/scrap.ts | 4 +- sitio/src/routes/+page.server.ts | 17 ++-- sitio/src/routes/+page.svelte | 2 +- sitio/src/routes/ean/[ean]/+page.server.ts | 19 +++-- sitio/src/routes/ean/[ean]/+page.svelte | 5 ++ 12 files changed, 146 insertions(+), 21 deletions(-) create mode 100644 db-datos/drizzle/0002_wild_amazoness.sql create mode 100644 db-datos/drizzle/meta/0002_snapshot.json diff --git a/db-datos/drizzle/0002_wild_amazoness.sql b/db-datos/drizzle/0002_wild_amazoness.sql new file mode 100644 index 0000000..8099b7f --- /dev/null +++ b/db-datos/drizzle/0002_wild_amazoness.sql @@ -0,0 +1,2 @@ +ALTER TABLE precios ADD `name` text;--> statement-breakpoint +ALTER TABLE precios ADD `image_url` text; \ No newline at end of file diff --git a/db-datos/drizzle/meta/0002_snapshot.json b/db-datos/drizzle/meta/0002_snapshot.json new file mode 100644 index 0000000..8acddad --- /dev/null +++ b/db-datos/drizzle/meta/0002_snapshot.json @@ -0,0 +1,93 @@ +{ + "version": "5", + "dialect": "sqlite", + "id": "cbd90a60-7568-489f-ac45-95bd8818ffbd", + "prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c", + "tables": { + "precios": { + "name": "precios", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": true + }, + "ean": { + "name": "ean", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "fetched_at": { + "name": "fetched_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "precio_centavos": { + "name": "precio_centavos", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "in_stock": { + "name": "in_stock", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "url": { + "name": "url", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "warc_record_id": { + "name": "warc_record_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "parser_version": { + "name": "parser_version", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "image_url": { + "name": "image_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {} + } + }, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + } +} \ No newline at end of file diff --git a/db-datos/drizzle/meta/_journal.json b/db-datos/drizzle/meta/_journal.json index 4425484..708d59d 100644 --- a/db-datos/drizzle/meta/_journal.json +++ b/db-datos/drizzle/meta/_journal.json @@ -15,6 +15,13 @@ "when": 1703374278842, "tag": "0001_spotty_red_hulk", "breakpoints": true + }, + { + "idx": 2, + "version": "5", + "when": 1703452301821, + "tag": "0002_wild_amazoness", + "breakpoints": true } ] } \ No newline at end of file diff --git a/db-datos/schema.ts b/db-datos/schema.ts index 4f15736..0c58c08 100644 --- a/db-datos/schema.ts +++ b/db-datos/schema.ts @@ -9,6 +9,8 @@ export const precios = sqliteTable("precios", { url: text("url").notNull(), warcRecordId: text("warc_record_id"), parserVersion: integer("parser_version"), + name: text("name"), + imageUrl: text("image_url"), }); export type Precio = typeof precios.$inferSelect; diff --git a/scraper/parsers/carrefour.ts b/scraper/parsers/carrefour.ts index 788c4a7..aa32e11 100644 --- a/scraper/parsers/carrefour.ts +++ b/scraper/parsers/carrefour.ts @@ -55,14 +55,17 @@ export function getCarrefourProduct(html: string | Buffer): Precioish { const precioCentavos = priceFromMeta(dom); - // const productLd = findJsonLd(dom, "Product"); const ean = eanFromSeedState(dom); const ld = getProductJsonLd(dom); + const name = ld.name; + const imageUrl = ld.image; const inStock = ld.offers.offers[0].availability === "http://schema.org/InStock"; return { + name, + imageUrl, ean, precioCentavos, inStock, diff --git a/scraper/parsers/coto.ts b/scraper/parsers/coto.ts index 5fda36f..31daaee 100644 --- a/scraper/parsers/coto.ts +++ b/scraper/parsers/coto.ts @@ -34,5 +34,10 @@ export function getCotoProduct(html: string | Buffer): Precioish { const ean = getEanFromText(dom); const precioCentavos = getPriceFromText(dom); - return { ean, precioCentavos }; + const name = dom.document.querySelector("h1.product_page")?.textContent; + const imageUrl = dom.document.querySelector( + ".productImageZoom img" + )?.src; + + return { name, imageUrl, ean, precioCentavos }; } diff --git a/scraper/parsers/dia.ts b/scraper/parsers/dia.ts index 0779da2..be3e2c5 100644 --- a/scraper/parsers/dia.ts +++ b/scraper/parsers/dia.ts @@ -10,10 +10,14 @@ export function getDiaProduct(html: string | Buffer): Precioish { const precioCentavos = priceFromMeta(dom); const ld = getProductJsonLd(dom); + const name = ld.name; + const imageUrl = ld.image; const inStock = ld.offers.offers[0].availability === "http://schema.org/InStock"; return { + name, + imageUrl, ean, precioCentavos, inStock, diff --git a/scraper/scrap.ts b/scraper/scrap.ts index 34bed3e..a353354 100644 --- a/scraper/scrap.ts +++ b/scraper/scrap.ts @@ -1,5 +1,3 @@ -/// -/// import { Database } from "bun:sqlite"; import { drizzle } from "drizzle-orm/bun-sqlite"; import * as schema from "db-datos/schema.js"; @@ -14,7 +12,7 @@ import pMap from "p-map"; import { and, eq, sql } from "drizzle-orm"; const DEBUG = false; -const PARSER_VERSION = 1; +const PARSER_VERSION = 2; const sqlite = new Database("sqlite.db"); const db = drizzle(sqlite, { schema }); diff --git a/sitio/src/routes/+page.server.ts b/sitio/src/routes/+page.server.ts index fbadbf5..0765e3e 100644 --- a/sitio/src/routes/+page.server.ts +++ b/sitio/src/routes/+page.server.ts @@ -1,18 +1,17 @@ import { error } from "@sveltejs/kit"; import type { PageServerLoad } from "./$types"; import { db, schema } from "$lib/server/db"; -import { ilike, like, sql } from "drizzle-orm"; +const { precios } = schema; +import { sql } from "drizzle-orm"; export const load: PageServerLoad = async ({ params }) => { const q = db - .select({ ean: schema.precios.ean }) - .from(schema.precios) - .where( - like(schema.precios.url, `https://diaonline.supermercadosdia.com.ar%`), - ) - .groupBy(schema.precios.ean) + .select({ ean: precios.ean, name: precios.name }) + .from(precios) + .groupBy(precios.ean) + .having(sql`max(length(name))`) .orderBy(sql`random()`) .limit(150); - const precios = await q; - return { precios }; + const res = await q; + return { precios: res }; }; diff --git a/sitio/src/routes/+page.svelte b/sitio/src/routes/+page.svelte index a17b97a..26d79aa 100644 --- a/sitio/src/routes/+page.svelte +++ b/sitio/src/routes/+page.svelte @@ -10,7 +10,7 @@ {#each data.precios as product}
  • - {product.ean} + {product.name}
  • {/each} diff --git a/sitio/src/routes/ean/[ean]/+page.server.ts b/sitio/src/routes/ean/[ean]/+page.server.ts index 4ddc26d..5eb887a 100644 --- a/sitio/src/routes/ean/[ean]/+page.server.ts +++ b/sitio/src/routes/ean/[ean]/+page.server.ts @@ -1,13 +1,20 @@ import { error } from "@sveltejs/kit"; -import { eq } from "drizzle-orm"; +import { eq, max } from "drizzle-orm"; import type { PageServerLoad } from "./$types"; import { db, schema } from "$lib/server/db"; +const { precios } = schema; export const load: PageServerLoad = async ({ params }) => { - const precios = await db.query.precios.findMany({ - where: eq(schema.precios.ean, params.ean), - }); - if (precios.length === 0) return error(404, "Not Found"); + const q = db + .select() + .from(precios) + .where(eq(precios.ean, params.ean)) + .groupBy(precios.warcRecordId) + .having(max(precios.parserVersion)); + const res = await q; + if (res.length === 0) return error(404, "Not Found"); - return { precios }; + const meta = res.find((p) => p.name); + + return { precios: res, meta }; }; diff --git a/sitio/src/routes/ean/[ean]/+page.svelte b/sitio/src/routes/ean/[ean]/+page.svelte index 87b4f73..bb05a7f 100644 --- a/sitio/src/routes/ean/[ean]/+page.svelte +++ b/sitio/src/routes/ean/[ean]/+page.svelte @@ -5,6 +5,11 @@ export let data: PageData; +{#if data.meta} +

    {data.meta.name}

    + +{/if} +
      {#each data.precios as precio}