mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-29 13:06:19 +00:00
Compare commits
6 commits
8ccd69de1b
...
925175ba9d
Author | SHA1 | Date | |
---|---|---|---|
925175ba9d | |||
08f62c78db | |||
794247d657 | |||
f2f5c7afdd | |||
f5cc28f44b | |||
3a6fe55da4 |
24 changed files with 355 additions and 402 deletions
BIN
bun.lockb
Executable file
BIN
bun.lockb
Executable file
Binary file not shown.
2
db-datos/drizzle/0002_wild_amazoness.sql
Normal file
2
db-datos/drizzle/0002_wild_amazoness.sql
Normal file
|
@ -0,0 +1,2 @@
|
|||
ALTER TABLE precios ADD `name` text;--> statement-breakpoint
|
||||
ALTER TABLE precios ADD `image_url` text;
|
93
db-datos/drizzle/meta/0002_snapshot.json
Normal file
93
db-datos/drizzle/meta/0002_snapshot.json
Normal file
|
@ -0,0 +1,93 @@
|
|||
{
|
||||
"version": "5",
|
||||
"dialect": "sqlite",
|
||||
"id": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
|
||||
"prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
|
||||
"tables": {
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"schemas": {},
|
||||
"tables": {},
|
||||
"columns": {}
|
||||
}
|
||||
}
|
|
@ -15,6 +15,13 @@
|
|||
"when": 1703374278842,
|
||||
"tag": "0001_spotty_red_hulk",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 2,
|
||||
"version": "5",
|
||||
"when": 1703452301821,
|
||||
"tag": "0002_wild_amazoness",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,11 +1,11 @@
|
|||
import { migrate } from "drizzle-orm/better-sqlite3/migrator";
|
||||
import Database from "better-sqlite3";
|
||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||
import Database from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
||||
import * as schema from "./schema.js";
|
||||
|
||||
const sqlite = new Database("../scraper/sqlite.db");
|
||||
const db = drizzle(sqlite, { schema });
|
||||
|
||||
await migrate(db, { migrationsFolder: "./drizzle" });
|
||||
migrate(db, { migrationsFolder: "./drizzle" });
|
||||
|
||||
sqlite.close();
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"migrate": "node --import tsx/esm migrate.ts"
|
||||
"migrate": "bun migrate.ts",
|
||||
"generate": "drizzle-kit generate:sqlite"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
|
|
|
@ -9,6 +9,8 @@ export const precios = sqliteTable("precios", {
|
|||
url: text("url").notNull(),
|
||||
warcRecordId: text("warc_record_id"),
|
||||
parserVersion: integer("parser_version"),
|
||||
name: text("name"),
|
||||
imageUrl: text("image_url"),
|
||||
});
|
||||
|
||||
export type Precio = typeof precios.$inferSelect;
|
||||
|
|
11
package.json
Normal file
11
package.json
Normal file
|
@ -0,0 +1,11 @@
|
|||
{
|
||||
"name": "preciazo",
|
||||
"private": true,
|
||||
"workspaces": [
|
||||
"dia-link-scraper",
|
||||
"coto-link-scraper",
|
||||
"scraper",
|
||||
"sitio",
|
||||
"db-datos"
|
||||
]
|
||||
}
|
456
pnpm-lock.yaml
456
pnpm-lock.yaml
File diff suppressed because it is too large
Load diff
|
@ -1,6 +0,0 @@
|
|||
packages:
|
||||
- "dia-link-scraper"
|
||||
- "coto-link-scraper"
|
||||
- "scraper"
|
||||
- "sitio"
|
||||
- "db-datos"
|
|
@ -1,40 +0,0 @@
|
|||
// import { run, bench, group, baseline } from "mitata";
|
||||
import { createReadStream } from "node:fs";
|
||||
import { Writable } from "node:stream";
|
||||
import { pipeline } from "node:stream/promises";
|
||||
import { getCarrefourProduct } from "./carrefour.js";
|
||||
import { WARCParser } from "warcio";
|
||||
// import { ZSTDDecompress } from "simple-zstd";
|
||||
// import { AutoWARCParser } from "node-warc";
|
||||
|
||||
// const html = await readFile("carrefour.html", "utf-8");
|
||||
// bench("carrefour", () => {
|
||||
// getCarrefourProduct(html);
|
||||
// });
|
||||
|
||||
// await bench("warcio", async () => {
|
||||
// const warc = Bun.spawn(
|
||||
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
|
||||
// {
|
||||
// // stdin: Bun.file().stream(),
|
||||
// }
|
||||
// ).stdout;
|
||||
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
|
||||
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
|
||||
|
||||
// const parser = new WARCParser(warc);
|
||||
// for await (const record of parser) {
|
||||
// const html = await record.contentText();
|
||||
// }
|
||||
// });
|
||||
|
||||
// await bench("warc", );
|
||||
|
||||
async function bench(name: string, func: () => Promise<void>) {
|
||||
const t0 = performance.now();
|
||||
await func();
|
||||
const t1 = performance.now();
|
||||
console.debug(`${name} took ${t1 - t0}`);
|
||||
}
|
||||
|
||||
// await run({});
|
|
@ -1,27 +0,0 @@
|
|||
import { readFile, writeFile } from "fs/promises";
|
||||
import pMap from "p-map";
|
||||
import { nanoid } from "nanoid";
|
||||
import { getHtml } from "./fetch.js";
|
||||
import { join } from "path";
|
||||
|
||||
(async () => {
|
||||
const inputPath = process.argv[2];
|
||||
const outputPath = process.argv[3];
|
||||
if (!inputPath || !outputPath) {
|
||||
console.error("falta input y/o output");
|
||||
process.exit(1);
|
||||
}
|
||||
const file = await readFile(inputPath, "utf-8");
|
||||
const urls = file.split("\n");
|
||||
|
||||
await pMap(
|
||||
urls,
|
||||
async (url: string) => {
|
||||
const id = nanoid();
|
||||
const html = await getHtml(url);
|
||||
await writeFile(join(outputPath, `${id}.link`), url);
|
||||
await writeFile(join(outputPath, id), html);
|
||||
},
|
||||
{ concurrency: 12 }
|
||||
);
|
||||
})();
|
|
@ -1,6 +1,6 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { Precioish, type Precio } from "./scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta } from "./common.js";
|
||||
import { Precioish, type Precio } from "../scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta } from "../common.js";
|
||||
|
||||
function getEanByTable(dom: Window): string {
|
||||
const eanLabelEl = dom.window.document.querySelector(
|
||||
|
@ -55,14 +55,17 @@ export function getCarrefourProduct(html: string | Buffer): Precioish {
|
|||
|
||||
const precioCentavos = priceFromMeta(dom);
|
||||
|
||||
// const productLd = findJsonLd(dom, "Product");
|
||||
const ean = eanFromSeedState(dom);
|
||||
|
||||
const ld = getProductJsonLd(dom);
|
||||
const name = ld.name;
|
||||
const imageUrl = ld.image;
|
||||
const inStock =
|
||||
ld.offers.offers[0].availability === "http://schema.org/InStock";
|
||||
|
||||
return {
|
||||
name,
|
||||
imageUrl,
|
||||
ean,
|
||||
precioCentavos,
|
||||
inStock,
|
|
@ -1,5 +1,5 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "./scrap.js";
|
||||
import { type Precioish } from "../scrap.js";
|
||||
|
||||
function getEanFromText({ document }: Window) {
|
||||
const potentialEanEls = Array.from(
|
||||
|
@ -34,5 +34,10 @@ export function getCotoProduct(html: string | Buffer): Precioish {
|
|||
const ean = getEanFromText(dom);
|
||||
const precioCentavos = getPriceFromText(dom);
|
||||
|
||||
return { ean, precioCentavos };
|
||||
const name = dom.document.querySelector("h1.product_page")?.textContent;
|
||||
const imageUrl = dom.document.querySelector<HTMLImageElement>(
|
||||
".productImageZoom img"
|
||||
)?.src;
|
||||
|
||||
return { name, imageUrl, ean, precioCentavos };
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "./scrap.js";
|
||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||
import { type Precioish } from "../scrap.js";
|
||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
|
||||
|
||||
export function getDiaProduct(html: string | Buffer): Precioish {
|
||||
const dom = parseHTML(html);
|
||||
|
@ -10,10 +10,14 @@ export function getDiaProduct(html: string | Buffer): Precioish {
|
|||
const precioCentavos = priceFromMeta(dom);
|
||||
|
||||
const ld = getProductJsonLd(dom);
|
||||
const name = ld.name;
|
||||
const imageUrl = ld.image;
|
||||
const inStock =
|
||||
ld.offers.offers[0].availability === "http://schema.org/InStock";
|
||||
|
||||
return {
|
||||
name,
|
||||
imageUrl,
|
||||
ean,
|
||||
precioCentavos,
|
||||
inStock,
|
|
@ -1,20 +1,18 @@
|
|||
/// <reference lib="dom" />
|
||||
/// <reference lib="dom.iterable" />
|
||||
import { Database } from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import * as schema from "db-datos/schema.js";
|
||||
import { WARCParser } from "warcio";
|
||||
import { writeFile } from "fs/promises";
|
||||
import { createHash } from "crypto";
|
||||
import { getCarrefourProduct } from "./carrefour.js";
|
||||
import { getDiaProduct } from "./dia.js";
|
||||
import { getCotoProduct } from "./coto.js";
|
||||
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
||||
import { getDiaProduct } from "./parsers/dia.js";
|
||||
import { getCotoProduct } from "./parsers/coto.js";
|
||||
import { join } from "path";
|
||||
import pMap from "p-map";
|
||||
import { and, eq, sql } from "drizzle-orm";
|
||||
|
||||
const DEBUG = false;
|
||||
const PARSER_VERSION = 1;
|
||||
const PARSER_VERSION = 2;
|
||||
|
||||
const sqlite = new Database("sqlite.db");
|
||||
const db = drizzle(sqlite, { schema });
|
||||
|
|
10
sitio/Containerfile
Normal file
10
sitio/Containerfile
Normal file
|
@ -0,0 +1,10 @@
|
|||
FROM docker.io/oven/bun:1-alpine
|
||||
COPY build/ .
|
||||
RUN bun i
|
||||
EXPOSE 3000
|
||||
|
||||
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
|
||||
ENV PROTOCOL_HEADER=x-forwarded-proto
|
||||
ENV HOST_HEADER=x-forwarded-host
|
||||
|
||||
CMD ["bun", "run", "start"]
|
|
@ -12,10 +12,9 @@
|
|||
"format": "prettier --write ."
|
||||
},
|
||||
"devDependencies": {
|
||||
"@sveltejs/adapter-node": "^2.0.2",
|
||||
"@sveltejs/kit": "^2.0.0",
|
||||
"@sveltejs/vite-plugin-svelte": "^3.0.0",
|
||||
"@types/better-sqlite3": "^7.6.8",
|
||||
"@types/bun": "^1.0.0",
|
||||
"autoprefixer": "^10.4.16",
|
||||
"db-datos": "workspace:^",
|
||||
"postcss": "^8.4.32",
|
||||
|
@ -24,6 +23,7 @@
|
|||
"prettier-plugin-svelte": "^3.1.2",
|
||||
"prettier-plugin-tailwindcss": "^0.5.9",
|
||||
"svelte": "^4.2.7",
|
||||
"svelte-adapter-bun": "^0.5.1",
|
||||
"svelte-check": "^3.6.0",
|
||||
"tailwindcss": "^3.3.6",
|
||||
"tslib": "^2.4.1",
|
||||
|
@ -32,7 +32,6 @@
|
|||
},
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"better-sqlite3": "^9.2.2",
|
||||
"chart.js": "^4.4.1",
|
||||
"chartjs-adapter-dayjs-4": "^1.0.4",
|
||||
"dayjs": "^1.11.10",
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import Database from "better-sqlite3";
|
||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||
import Database from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import * as schema from "db-datos/schema.js";
|
||||
import { env } from "$env/dynamic/private";
|
||||
|
||||
|
|
|
@ -1,18 +1,17 @@
|
|||
import { error } from "@sveltejs/kit";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { db, schema } from "$lib/server/db";
|
||||
import { ilike, like, sql } from "drizzle-orm";
|
||||
const { precios } = schema;
|
||||
import { sql } from "drizzle-orm";
|
||||
|
||||
export const load: PageServerLoad = async ({ params }) => {
|
||||
const q = db
|
||||
.select({ ean: schema.precios.ean })
|
||||
.from(schema.precios)
|
||||
.where(
|
||||
like(schema.precios.url, `https://diaonline.supermercadosdia.com.ar%`),
|
||||
)
|
||||
.groupBy(schema.precios.ean)
|
||||
.select({ ean: precios.ean, name: precios.name })
|
||||
.from(precios)
|
||||
.groupBy(precios.ean)
|
||||
.having(sql`max(length(name))`)
|
||||
.orderBy(sql`random()`)
|
||||
.limit(150);
|
||||
const precios = await q;
|
||||
return { precios };
|
||||
const res = await q;
|
||||
return { precios: res };
|
||||
};
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
{#each data.precios as product}
|
||||
<li>
|
||||
<a href={`/ean/${product.ean}`}>
|
||||
{product.ean}
|
||||
{product.name}
|
||||
</a>
|
||||
</li>
|
||||
{/each}
|
||||
|
|
|
@ -1,13 +1,20 @@
|
|||
import { error } from "@sveltejs/kit";
|
||||
import { eq } from "drizzle-orm";
|
||||
import { eq, max } from "drizzle-orm";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { db, schema } from "$lib/server/db";
|
||||
const { precios } = schema;
|
||||
|
||||
export const load: PageServerLoad = async ({ params }) => {
|
||||
const precios = await db.query.precios.findMany({
|
||||
where: eq(schema.precios.ean, params.ean),
|
||||
});
|
||||
if (precios.length === 0) return error(404, "Not Found");
|
||||
const q = db
|
||||
.select()
|
||||
.from(precios)
|
||||
.where(eq(precios.ean, params.ean))
|
||||
.groupBy(precios.warcRecordId)
|
||||
.having(max(precios.parserVersion));
|
||||
const res = await q;
|
||||
if (res.length === 0) return error(404, "Not Found");
|
||||
|
||||
return { precios };
|
||||
const meta = res.find((p) => p.name);
|
||||
|
||||
return { precios: res, meta };
|
||||
};
|
||||
|
|
|
@ -5,6 +5,11 @@
|
|||
export let data: PageData;
|
||||
</script>
|
||||
|
||||
{#if data.meta}
|
||||
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
|
||||
<img src={data.meta.imageUrl} class="max-h-48" />
|
||||
{/if}
|
||||
|
||||
<ul>
|
||||
{#each data.precios as precio}
|
||||
<li>
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import adapter from "@sveltejs/adapter-node";
|
||||
// import adapter from "svelte-adapter-bun";
|
||||
// import adapter from "@sveltejs/adapter-node";
|
||||
import adapter from "svelte-adapter-bun";
|
||||
import { vitePreprocess } from "@sveltejs/vite-plugin-svelte";
|
||||
|
||||
/** @type {import('@sveltejs/kit').Config} */
|
||||
|
|
Loading…
Reference in a new issue