parsear name y imageUrl

This commit is contained in:
Cat /dev/Nulo 2023-12-24 19:21:51 -03:00
parent 08f62c78db
commit 925175ba9d
12 changed files with 146 additions and 21 deletions

View file

@@ -0,0 +1,2 @@
ALTER TABLE precios ADD `name` text;--> statement-breakpoint
ALTER TABLE precios ADD `image_url` text;

View file

@@ -0,0 +1,93 @@
{
"version": "5",
"dialect": "sqlite",
"id": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
"prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
"tables": {
"precios": {
"name": "precios",
"columns": {
"id": {
"name": "id",
"type": "integer",
"primaryKey": true,
"notNull": true,
"autoincrement": true
},
"ean": {
"name": "ean",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"fetched_at": {
"name": "fetched_at",
"type": "integer",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"precio_centavos": {
"name": "precio_centavos",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"in_stock": {
"name": "in_stock",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"url": {
"name": "url",
"type": "text",
"primaryKey": false,
"notNull": true,
"autoincrement": false
},
"warc_record_id": {
"name": "warc_record_id",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"parser_version": {
"name": "parser_version",
"type": "integer",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"name": {
"name": "name",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
},
"image_url": {
"name": "image_url",
"type": "text",
"primaryKey": false,
"notNull": false,
"autoincrement": false
}
},
"indexes": {},
"foreignKeys": {},
"compositePrimaryKeys": {},
"uniqueConstraints": {}
}
},
"enums": {},
"_meta": {
"schemas": {},
"tables": {},
"columns": {}
}
}

View file

@@ -15,6 +15,13 @@
"when": 1703374278842, "when": 1703374278842,
"tag": "0001_spotty_red_hulk", "tag": "0001_spotty_red_hulk",
"breakpoints": true "breakpoints": true
},
{
"idx": 2,
"version": "5",
"when": 1703452301821,
"tag": "0002_wild_amazoness",
"breakpoints": true
} }
] ]
} }

View file

@@ -9,6 +9,8 @@ export const precios = sqliteTable("precios", {
url: text("url").notNull(), url: text("url").notNull(),
warcRecordId: text("warc_record_id"), warcRecordId: text("warc_record_id"),
parserVersion: integer("parser_version"), parserVersion: integer("parser_version"),
name: text("name"),
imageUrl: text("image_url"),
}); });
export type Precio = typeof precios.$inferSelect; export type Precio = typeof precios.$inferSelect;

View file

@@ -55,14 +55,17 @@ export function getCarrefourProduct(html: string | Buffer): Precioish {
const precioCentavos = priceFromMeta(dom); const precioCentavos = priceFromMeta(dom);
// const productLd = findJsonLd(dom, "Product");
const ean = eanFromSeedState(dom); const ean = eanFromSeedState(dom);
const ld = getProductJsonLd(dom); const ld = getProductJsonLd(dom);
const name = ld.name;
const imageUrl = ld.image;
const inStock = const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock"; ld.offers.offers[0].availability === "http://schema.org/InStock";
return { return {
name,
imageUrl,
ean, ean,
precioCentavos, precioCentavos,
inStock, inStock,

View file

@@ -34,5 +34,10 @@ export function getCotoProduct(html: string | Buffer): Precioish {
const ean = getEanFromText(dom); const ean = getEanFromText(dom);
const precioCentavos = getPriceFromText(dom); const precioCentavos = getPriceFromText(dom);
return { ean, precioCentavos }; const name = dom.document.querySelector("h1.product_page")?.textContent;
const imageUrl = dom.document.querySelector<HTMLImageElement>(
".productImageZoom img"
)?.src;
return { name, imageUrl, ean, precioCentavos };
} }

View file

@@ -10,10 +10,14 @@ export function getDiaProduct(html: string | Buffer): Precioish {
const precioCentavos = priceFromMeta(dom); const precioCentavos = priceFromMeta(dom);
const ld = getProductJsonLd(dom); const ld = getProductJsonLd(dom);
const name = ld.name;
const imageUrl = ld.image;
const inStock = const inStock =
ld.offers.offers[0].availability === "http://schema.org/InStock"; ld.offers.offers[0].availability === "http://schema.org/InStock";
return { return {
name,
imageUrl,
ean, ean,
precioCentavos, precioCentavos,
inStock, inStock,

View file

@@ -1,5 +1,3 @@
/// <reference lib="dom" />
/// <reference lib="dom.iterable" />
import { Database } from "bun:sqlite"; import { Database } from "bun:sqlite";
import { drizzle } from "drizzle-orm/bun-sqlite"; import { drizzle } from "drizzle-orm/bun-sqlite";
import * as schema from "db-datos/schema.js"; import * as schema from "db-datos/schema.js";
@@ -14,7 +12,7 @@ import pMap from "p-map";
import { and, eq, sql } from "drizzle-orm"; import { and, eq, sql } from "drizzle-orm";
const DEBUG = false; const DEBUG = false;
const PARSER_VERSION = 1; const PARSER_VERSION = 2;
const sqlite = new Database("sqlite.db"); const sqlite = new Database("sqlite.db");
const db = drizzle(sqlite, { schema }); const db = drizzle(sqlite, { schema });

View file

@@ -1,18 +1,17 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db, schema } from "$lib/server/db";
import { ilike, like, sql } from "drizzle-orm"; const { precios } = schema;
import { sql } from "drizzle-orm";
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const q = db const q = db
.select({ ean: schema.precios.ean }) .select({ ean: precios.ean, name: precios.name })
.from(schema.precios) .from(precios)
.where( .groupBy(precios.ean)
like(schema.precios.url, `https://diaonline.supermercadosdia.com.ar%`), .having(sql`max(length(name))`)
)
.groupBy(schema.precios.ean)
.orderBy(sql`random()`) .orderBy(sql`random()`)
.limit(150); .limit(150);
const precios = await q; const res = await q;
return { precios }; return { precios: res };
}; };

View file

@@ -10,7 +10,7 @@
{#each data.precios as product} {#each data.precios as product}
<li> <li>
<a href={`/ean/${product.ean}`}> <a href={`/ean/${product.ean}`}>
{product.ean} {product.name}
</a> </a>
</li> </li>
{/each} {/each}

View file

@@ -1,13 +1,20 @@
import { error } from "@sveltejs/kit"; import { error } from "@sveltejs/kit";
import { eq } from "drizzle-orm"; import { eq, max } from "drizzle-orm";
import type { PageServerLoad } from "./$types"; import type { PageServerLoad } from "./$types";
import { db, schema } from "$lib/server/db"; import { db, schema } from "$lib/server/db";
const { precios } = schema;
export const load: PageServerLoad = async ({ params }) => { export const load: PageServerLoad = async ({ params }) => {
const precios = await db.query.precios.findMany({ const q = db
where: eq(schema.precios.ean, params.ean), .select()
}); .from(precios)
if (precios.length === 0) return error(404, "Not Found"); .where(eq(precios.ean, params.ean))
.groupBy(precios.warcRecordId)
.having(max(precios.parserVersion));
const res = await q;
if (res.length === 0) return error(404, "Not Found");
return { precios }; const meta = res.find((p) => p.name);
return { precios: res, meta };
}; };

View file

@@ -5,6 +5,11 @@
export let data: PageData; export let data: PageData;
</script> </script>
{#if data.meta}
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
<img src={data.meta.imageUrl} class="max-h-48" />
{/if}
<ul> <ul>
{#each data.precios as precio} {#each data.precios as precio}
<li> <li>