mirror of
https://github.com/catdevnull/preciazo.git
synced 2025-02-23 22:46:28 +00:00
saltearse volver a scrapear algo que ya existe
This commit is contained in:
parent
106dee13ac
commit
c004be4a31
1 changed file with 21 additions and 3 deletions
|
@ -11,6 +11,7 @@ import { getDiaProduct } from "./dia.js";
|
||||||
import { getCotoProduct } from "./coto.js";
|
import { getCotoProduct } from "./coto.js";
|
||||||
import { join } from "path";
|
import { join } from "path";
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
|
import { and, eq, sql } from "drizzle-orm";
|
||||||
|
|
||||||
const DEBUG = false;
|
const DEBUG = false;
|
||||||
const PARSER_VERSION = 1;
|
const PARSER_VERSION = 1;
|
||||||
|
@ -22,6 +23,17 @@ sqlite.run(`
|
||||||
pragma journal_mode = WAL;
|
pragma journal_mode = WAL;
|
||||||
PRAGMA synchronous = NORMAL;
|
PRAGMA synchronous = NORMAL;
|
||||||
`);
|
`);
|
||||||
|
const getPrevPrecio = db
|
||||||
|
.select({ id: schema.precios.id })
|
||||||
|
.from(schema.precios)
|
||||||
|
.where(
|
||||||
|
and(
|
||||||
|
eq(schema.precios.warcRecordId, sql.placeholder("warcRecordId")),
|
||||||
|
eq(schema.precios.parserVersion, PARSER_VERSION)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.limit(1)
|
||||||
|
.prepare();
|
||||||
|
|
||||||
let progress = { done: 0, errors: 0 };
|
let progress = { done: 0, errors: 0 };
|
||||||
await pMap(process.argv.slice(2), (path) => parseWarc(path), {
|
await pMap(process.argv.slice(2), (path) => parseWarc(path), {
|
||||||
|
@ -49,9 +61,15 @@ async function parseWarc(path: string) {
|
||||||
for await (const record of parser) {
|
for await (const record of parser) {
|
||||||
if (record.warcType === "response") {
|
if (record.warcType === "response") {
|
||||||
if (!record.warcTargetURI) continue;
|
if (!record.warcTargetURI) continue;
|
||||||
|
const warcRecordId = record.warcHeader("WARC-Record-ID");
|
||||||
|
if (!warcRecordId) throw new Error("No tiene WARC-Record-ID");
|
||||||
|
|
||||||
|
if (getPrevPrecio.get({ warcRecordId })) {
|
||||||
|
console.debug(`skipped ${warcRecordId}`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// TODO: sobreescribir si existe el mismo record-id pero con version mas bajo?
|
||||||
|
|
||||||
// TODO: saltear si ya existe el record-id con el mismo parser version
|
|
||||||
// y sobreescribir si existe el mismo record-id pero con version mas bajo?
|
|
||||||
const html = await record.contentText();
|
const html = await record.contentText();
|
||||||
|
|
||||||
const url = new URL(record.warcTargetURI);
|
const url = new URL(record.warcTargetURI);
|
||||||
|
@ -69,7 +87,7 @@ async function parseWarc(path: string) {
|
||||||
...ish,
|
...ish,
|
||||||
fetchedAt: new Date(record.warcDate!),
|
fetchedAt: new Date(record.warcDate!),
|
||||||
url: record.warcTargetURI,
|
url: record.warcTargetURI,
|
||||||
warcRecordId: record.warcHeader("WARC-Record-ID"),
|
warcRecordId,
|
||||||
parserVersion: PARSER_VERSION,
|
parserVersion: PARSER_VERSION,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue