mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
Compare commits
No commits in common. "925175ba9d85df7e6718d046d514917314ce34f8" and "8ccd69de1bb46ec014137b623b7f2458b581134a" have entirely different histories.
925175ba9d
...
8ccd69de1b
24 changed files with 402 additions and 355 deletions
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@ -1,2 +0,0 @@
|
|||
ALTER TABLE precios ADD `name` text;--> statement-breakpoint
|
||||
ALTER TABLE precios ADD `image_url` text;
|
|
@ -1,93 +0,0 @@
|
|||
{
|
||||
"version": "5",
|
||||
"dialect": "sqlite",
|
||||
"id": "cbd90a60-7568-489f-ac45-95bd8818ffbd",
|
||||
"prevId": "a565621c-046e-4f4d-b505-104e2c4f2b6c",
|
||||
"tables": {
|
||||
"precios": {
|
||||
"name": "precios",
|
||||
"columns": {
|
||||
"id": {
|
||||
"name": "id",
|
||||
"type": "integer",
|
||||
"primaryKey": true,
|
||||
"notNull": true,
|
||||
"autoincrement": true
|
||||
},
|
||||
"ean": {
|
||||
"name": "ean",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"fetched_at": {
|
||||
"name": "fetched_at",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"precio_centavos": {
|
||||
"name": "precio_centavos",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"in_stock": {
|
||||
"name": "in_stock",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"url": {
|
||||
"name": "url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": true,
|
||||
"autoincrement": false
|
||||
},
|
||||
"warc_record_id": {
|
||||
"name": "warc_record_id",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"parser_version": {
|
||||
"name": "parser_version",
|
||||
"type": "integer",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"name": {
|
||||
"name": "name",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
},
|
||||
"image_url": {
|
||||
"name": "image_url",
|
||||
"type": "text",
|
||||
"primaryKey": false,
|
||||
"notNull": false,
|
||||
"autoincrement": false
|
||||
}
|
||||
},
|
||||
"indexes": {},
|
||||
"foreignKeys": {},
|
||||
"compositePrimaryKeys": {},
|
||||
"uniqueConstraints": {}
|
||||
}
|
||||
},
|
||||
"enums": {},
|
||||
"_meta": {
|
||||
"schemas": {},
|
||||
"tables": {},
|
||||
"columns": {}
|
||||
}
|
||||
}
|
|
@ -15,13 +15,6 @@
|
|||
"when": 1703374278842,
|
||||
"tag": "0001_spotty_red_hulk",
|
||||
"breakpoints": true
|
||||
},
|
||||
{
|
||||
"idx": 2,
|
||||
"version": "5",
|
||||
"when": 1703452301821,
|
||||
"tag": "0002_wild_amazoness",
|
||||
"breakpoints": true
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,11 +1,11 @@
|
|||
import Database from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
||||
import { migrate } from "drizzle-orm/better-sqlite3/migrator";
|
||||
import Database from "better-sqlite3";
|
||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||
import * as schema from "./schema.js";
|
||||
|
||||
const sqlite = new Database("../scraper/sqlite.db");
|
||||
const db = drizzle(sqlite, { schema });
|
||||
|
||||
migrate(db, { migrationsFolder: "./drizzle" });
|
||||
await migrate(db, { migrationsFolder: "./drizzle" });
|
||||
|
||||
sqlite.close();
|
||||
|
|
|
@ -5,8 +5,7 @@
|
|||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"migrate": "bun migrate.ts",
|
||||
"generate": "drizzle-kit generate:sqlite"
|
||||
"migrate": "node --import tsx/esm migrate.ts"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
|
|
|
@ -9,8 +9,6 @@ export const precios = sqliteTable("precios", {
|
|||
url: text("url").notNull(),
|
||||
warcRecordId: text("warc_record_id"),
|
||||
parserVersion: integer("parser_version"),
|
||||
name: text("name"),
|
||||
imageUrl: text("image_url"),
|
||||
});
|
||||
|
||||
export type Precio = typeof precios.$inferSelect;
|
||||
|
|
11
package.json
11
package.json
|
@ -1,11 +0,0 @@
|
|||
{
|
||||
"name": "preciazo",
|
||||
"private": true,
|
||||
"workspaces": [
|
||||
"dia-link-scraper",
|
||||
"coto-link-scraper",
|
||||
"scraper",
|
||||
"sitio",
|
||||
"db-datos"
|
||||
]
|
||||
}
|
456
pnpm-lock.yaml
456
pnpm-lock.yaml
File diff suppressed because it is too large
Load diff
6
pnpm-workspace.yaml
Normal file
6
pnpm-workspace.yaml
Normal file
|
@ -0,0 +1,6 @@
|
|||
packages:
|
||||
- "dia-link-scraper"
|
||||
- "coto-link-scraper"
|
||||
- "scraper"
|
||||
- "sitio"
|
||||
- "db-datos"
|
40
scraper/bench.ts
Normal file
40
scraper/bench.ts
Normal file
|
@ -0,0 +1,40 @@
|
|||
// import { run, bench, group, baseline } from "mitata";
|
||||
import { createReadStream } from "node:fs";
|
||||
import { Writable } from "node:stream";
|
||||
import { pipeline } from "node:stream/promises";
|
||||
import { getCarrefourProduct } from "./carrefour.js";
|
||||
import { WARCParser } from "warcio";
|
||||
// import { ZSTDDecompress } from "simple-zstd";
|
||||
// import { AutoWARCParser } from "node-warc";
|
||||
|
||||
// const html = await readFile("carrefour.html", "utf-8");
|
||||
// bench("carrefour", () => {
|
||||
// getCarrefourProduct(html);
|
||||
// });
|
||||
|
||||
// await bench("warcio", async () => {
|
||||
// const warc = Bun.spawn(
|
||||
// ["zstd", "-do", "/dev/stdout", "../data/carrefour.warc.zst"],
|
||||
// {
|
||||
// // stdin: Bun.file().stream(),
|
||||
// }
|
||||
// ).stdout;
|
||||
// // const warc = Bun.file("../data/carrefour.warc").stream(1024 * 1024 * 512);
|
||||
// // const warc = createReadStream("../data/carrefour.warc.zst").pipe(ZSTDDecompress());
|
||||
|
||||
// const parser = new WARCParser(warc);
|
||||
// for await (const record of parser) {
|
||||
// const html = await record.contentText();
|
||||
// }
|
||||
// });
|
||||
|
||||
// await bench("warc", );
|
||||
|
||||
async function bench(name: string, func: () => Promise<void>) {
|
||||
const t0 = performance.now();
|
||||
await func();
|
||||
const t1 = performance.now();
|
||||
console.debug(`${name} took ${t1 - t0}`);
|
||||
}
|
||||
|
||||
// await run({});
|
|
@ -1,6 +1,6 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { Precioish, type Precio } from "../scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta } from "../common.js";
|
||||
import { Precioish, type Precio } from "./scrap.js";
|
||||
import { getProductJsonLd, priceFromMeta } from "./common.js";
|
||||
|
||||
function getEanByTable(dom: Window): string {
|
||||
const eanLabelEl = dom.window.document.querySelector(
|
||||
|
@ -55,17 +55,14 @@ export function getCarrefourProduct(html: string | Buffer): Precioish {
|
|||
|
||||
const precioCentavos = priceFromMeta(dom);
|
||||
|
||||
// const productLd = findJsonLd(dom, "Product");
|
||||
const ean = eanFromSeedState(dom);
|
||||
|
||||
const ld = getProductJsonLd(dom);
|
||||
const name = ld.name;
|
||||
const imageUrl = ld.image;
|
||||
const inStock =
|
||||
ld.offers.offers[0].availability === "http://schema.org/InStock";
|
||||
|
||||
return {
|
||||
name,
|
||||
imageUrl,
|
||||
ean,
|
||||
precioCentavos,
|
||||
inStock,
|
|
@ -1,5 +1,5 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "../scrap.js";
|
||||
import { type Precioish } from "./scrap.js";
|
||||
|
||||
function getEanFromText({ document }: Window) {
|
||||
const potentialEanEls = Array.from(
|
||||
|
@ -34,10 +34,5 @@ export function getCotoProduct(html: string | Buffer): Precioish {
|
|||
const ean = getEanFromText(dom);
|
||||
const precioCentavos = getPriceFromText(dom);
|
||||
|
||||
const name = dom.document.querySelector("h1.product_page")?.textContent;
|
||||
const imageUrl = dom.document.querySelector<HTMLImageElement>(
|
||||
".productImageZoom img"
|
||||
)?.src;
|
||||
|
||||
return { name, imageUrl, ean, precioCentavos };
|
||||
return { ean, precioCentavos };
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
import { type Precioish } from "../scrap.js";
|
||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "../common.js";
|
||||
import { type Precioish } from "./scrap.js";
|
||||
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||
|
||||
export function getDiaProduct(html: string | Buffer): Precioish {
|
||||
const dom = parseHTML(html);
|
||||
|
@ -10,14 +10,10 @@ export function getDiaProduct(html: string | Buffer): Precioish {
|
|||
const precioCentavos = priceFromMeta(dom);
|
||||
|
||||
const ld = getProductJsonLd(dom);
|
||||
const name = ld.name;
|
||||
const imageUrl = ld.image;
|
||||
const inStock =
|
||||
ld.offers.offers[0].availability === "http://schema.org/InStock";
|
||||
|
||||
return {
|
||||
name,
|
||||
imageUrl,
|
||||
ean,
|
||||
precioCentavos,
|
||||
inStock,
|
27
scraper/downloadUrls.ts
Normal file
27
scraper/downloadUrls.ts
Normal file
|
@ -0,0 +1,27 @@
|
|||
import { readFile, writeFile } from "fs/promises";
|
||||
import pMap from "p-map";
|
||||
import { nanoid } from "nanoid";
|
||||
import { getHtml } from "./fetch.js";
|
||||
import { join } from "path";
|
||||
|
||||
(async () => {
|
||||
const inputPath = process.argv[2];
|
||||
const outputPath = process.argv[3];
|
||||
if (!inputPath || !outputPath) {
|
||||
console.error("falta input y/o output");
|
||||
process.exit(1);
|
||||
}
|
||||
const file = await readFile(inputPath, "utf-8");
|
||||
const urls = file.split("\n");
|
||||
|
||||
await pMap(
|
||||
urls,
|
||||
async (url: string) => {
|
||||
const id = nanoid();
|
||||
const html = await getHtml(url);
|
||||
await writeFile(join(outputPath, `${id}.link`), url);
|
||||
await writeFile(join(outputPath, id), html);
|
||||
},
|
||||
{ concurrency: 12 }
|
||||
);
|
||||
})();
|
|
@ -1,18 +1,20 @@
|
|||
/// <reference lib="dom" />
|
||||
/// <reference lib="dom.iterable" />
|
||||
import { Database } from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import * as schema from "db-datos/schema.js";
|
||||
import { WARCParser } from "warcio";
|
||||
import { writeFile } from "fs/promises";
|
||||
import { createHash } from "crypto";
|
||||
import { getCarrefourProduct } from "./parsers/carrefour.js";
|
||||
import { getDiaProduct } from "./parsers/dia.js";
|
||||
import { getCotoProduct } from "./parsers/coto.js";
|
||||
import { getCarrefourProduct } from "./carrefour.js";
|
||||
import { getDiaProduct } from "./dia.js";
|
||||
import { getCotoProduct } from "./coto.js";
|
||||
import { join } from "path";
|
||||
import pMap from "p-map";
|
||||
import { and, eq, sql } from "drizzle-orm";
|
||||
|
||||
const DEBUG = false;
|
||||
const PARSER_VERSION = 2;
|
||||
const PARSER_VERSION = 1;
|
||||
|
||||
const sqlite = new Database("sqlite.db");
|
||||
const db = drizzle(sqlite, { schema });
|
||||
|
|
|
@ -1,10 +0,0 @@
|
|||
FROM docker.io/oven/bun:1-alpine
|
||||
COPY build/ .
|
||||
RUN bun i
|
||||
EXPOSE 3000
|
||||
|
||||
# https://github.com/gornostay25/svelte-adapter-bun/issues/39
|
||||
ENV PROTOCOL_HEADER=x-forwarded-proto
|
||||
ENV HOST_HEADER=x-forwarded-host
|
||||
|
||||
CMD ["bun", "run", "start"]
|
|
@ -12,9 +12,10 @@
|
|||
"format": "prettier --write ."
|
||||
},
|
||||
"devDependencies": {
|
||||
"@sveltejs/adapter-node": "^2.0.2",
|
||||
"@sveltejs/kit": "^2.0.0",
|
||||
"@sveltejs/vite-plugin-svelte": "^3.0.0",
|
||||
"@types/bun": "^1.0.0",
|
||||
"@types/better-sqlite3": "^7.6.8",
|
||||
"autoprefixer": "^10.4.16",
|
||||
"db-datos": "workspace:^",
|
||||
"postcss": "^8.4.32",
|
||||
|
@ -23,7 +24,6 @@
|
|||
"prettier-plugin-svelte": "^3.1.2",
|
||||
"prettier-plugin-tailwindcss": "^0.5.9",
|
||||
"svelte": "^4.2.7",
|
||||
"svelte-adapter-bun": "^0.5.1",
|
||||
"svelte-check": "^3.6.0",
|
||||
"tailwindcss": "^3.3.6",
|
||||
"tslib": "^2.4.1",
|
||||
|
@ -32,6 +32,7 @@
|
|||
},
|
||||
"type": "module",
|
||||
"dependencies": {
|
||||
"better-sqlite3": "^9.2.2",
|
||||
"chart.js": "^4.4.1",
|
||||
"chartjs-adapter-dayjs-4": "^1.0.4",
|
||||
"dayjs": "^1.11.10",
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import Database from "bun:sqlite";
|
||||
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||
import Database from "better-sqlite3";
|
||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||
import * as schema from "db-datos/schema.js";
|
||||
import { env } from "$env/dynamic/private";
|
||||
|
||||
|
|
|
@ -1,17 +1,18 @@
|
|||
import { error } from "@sveltejs/kit";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { db, schema } from "$lib/server/db";
|
||||
const { precios } = schema;
|
||||
import { sql } from "drizzle-orm";
|
||||
import { ilike, like, sql } from "drizzle-orm";
|
||||
|
||||
export const load: PageServerLoad = async ({ params }) => {
|
||||
const q = db
|
||||
.select({ ean: precios.ean, name: precios.name })
|
||||
.from(precios)
|
||||
.groupBy(precios.ean)
|
||||
.having(sql`max(length(name))`)
|
||||
.select({ ean: schema.precios.ean })
|
||||
.from(schema.precios)
|
||||
.where(
|
||||
like(schema.precios.url, `https://diaonline.supermercadosdia.com.ar%`),
|
||||
)
|
||||
.groupBy(schema.precios.ean)
|
||||
.orderBy(sql`random()`)
|
||||
.limit(150);
|
||||
const res = await q;
|
||||
return { precios: res };
|
||||
const precios = await q;
|
||||
return { precios };
|
||||
};
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
{#each data.precios as product}
|
||||
<li>
|
||||
<a href={`/ean/${product.ean}`}>
|
||||
{product.name}
|
||||
{product.ean}
|
||||
</a>
|
||||
</li>
|
||||
{/each}
|
||||
|
|
|
@ -1,20 +1,13 @@
|
|||
import { error } from "@sveltejs/kit";
|
||||
import { eq, max } from "drizzle-orm";
|
||||
import { eq } from "drizzle-orm";
|
||||
import type { PageServerLoad } from "./$types";
|
||||
import { db, schema } from "$lib/server/db";
|
||||
const { precios } = schema;
|
||||
|
||||
export const load: PageServerLoad = async ({ params }) => {
|
||||
const q = db
|
||||
.select()
|
||||
.from(precios)
|
||||
.where(eq(precios.ean, params.ean))
|
||||
.groupBy(precios.warcRecordId)
|
||||
.having(max(precios.parserVersion));
|
||||
const res = await q;
|
||||
if (res.length === 0) return error(404, "Not Found");
|
||||
const precios = await db.query.precios.findMany({
|
||||
where: eq(schema.precios.ean, params.ean),
|
||||
});
|
||||
if (precios.length === 0) return error(404, "Not Found");
|
||||
|
||||
const meta = res.find((p) => p.name);
|
||||
|
||||
return { precios: res, meta };
|
||||
return { precios };
|
||||
};
|
||||
|
|
|
@ -5,11 +5,6 @@
|
|||
export let data: PageData;
|
||||
</script>
|
||||
|
||||
{#if data.meta}
|
||||
<h1 class="text-3xl font-bold">{data.meta.name}</h1>
|
||||
<img src={data.meta.imageUrl} class="max-h-48" />
|
||||
{/if}
|
||||
|
||||
<ul>
|
||||
{#each data.precios as precio}
|
||||
<li>
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// import adapter from "@sveltejs/adapter-node";
|
||||
import adapter from "svelte-adapter-bun";
|
||||
import adapter from "@sveltejs/adapter-node";
|
||||
// import adapter from "svelte-adapter-bun";
|
||||
import { vitePreprocess } from "@sveltejs/vite-plugin-svelte";
|
||||
|
||||
/** @type {import('@sveltejs/kit').Config} */
|
||||
|
|
Loading…
Reference in a new issue