mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 19:46:19 +00:00
mejoras scrap
This commit is contained in:
parent
95eac9dcca
commit
8f8e133b5f
1 changed file with 15 additions and 10 deletions
|
@ -14,17 +14,19 @@ import { getCotoProduct } from "./coto.js";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
|
|
||||||
|
const DEBUG = false;
|
||||||
|
|
||||||
const sqlite = new Database("sqlite.db");
|
const sqlite = new Database("sqlite.db");
|
||||||
const db = drizzle(sqlite);
|
const db = drizzle(sqlite);
|
||||||
|
|
||||||
sqlite.run(`
|
sqlite.run(`
|
||||||
pragma journal_mode = WAL;
|
pragma journal_mode = OFF;
|
||||||
pragma synchronous = normal;
|
pragma synchronous = 0;
|
||||||
pragma temp_store = memory;
|
pragma cache_size = 1000000;
|
||||||
pragma mmap_size = 30000000000;
|
pragma locking_mode = exclusive;
|
||||||
`);
|
`);
|
||||||
sqlite.run(`
|
sqlite.run(`
|
||||||
create table precios(
|
create table if not exists precios(
|
||||||
id integer primary key autoincrement,
|
id integer primary key autoincrement,
|
||||||
ean text not null,
|
ean text not null,
|
||||||
fetched_at text not null,
|
fetched_at text not null,
|
||||||
|
@ -34,12 +36,11 @@ create table precios(
|
||||||
);
|
);
|
||||||
`);
|
`);
|
||||||
|
|
||||||
|
let progress = { done: 0, errors: 0 };
|
||||||
await pMap(process.argv.slice(2), (path) => parseWarc(path), {
|
await pMap(process.argv.slice(2), (path) => parseWarc(path), {
|
||||||
concurrency: 40,
|
concurrency: 40,
|
||||||
});
|
});
|
||||||
|
|
||||||
const DEBUG = false;
|
|
||||||
|
|
||||||
export type Precio = typeof precios.$inferInsert;
|
export type Precio = typeof precios.$inferInsert;
|
||||||
export type Precioish = Omit<Precio, "fetchedAt" | "url" | "id">;
|
export type Precioish = Omit<Precio, "fetchedAt" | "url" | "id">;
|
||||||
|
|
||||||
|
@ -48,12 +49,16 @@ async function storePrecioPoint(point: Precio) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function parseWarc(path: string) {
|
async function parseWarc(path: string) {
|
||||||
const warc = createReadStream(path);
|
// const warc = createReadStream(path);
|
||||||
|
|
||||||
|
const warc = Bun.spawn(["zstd", "-do", "/dev/stdout", path], {
|
||||||
|
stderr: "ignore",
|
||||||
|
}).stdout;
|
||||||
|
|
||||||
const parser = new WARCParser(warc);
|
const parser = new WARCParser(warc);
|
||||||
let progress = { done: 0, errors: 0 };
|
|
||||||
for await (const record of parser) {
|
for await (const record of parser) {
|
||||||
if (record.warcType === "response") {
|
if (record.warcType === "response") {
|
||||||
if (!record.warcTargetURI) throw new Error("no uri");
|
if (!record.warcTargetURI) continue;
|
||||||
const html = await record.contentText();
|
const html = await record.contentText();
|
||||||
|
|
||||||
const url = new URL(record.warcTargetURI);
|
const url = new URL(record.warcTargetURI);
|
||||||
|
|
Loading…
Reference in a new issue