mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 19:46:19 +00:00
scrapear dia y carrefour via bunjs
This commit is contained in:
parent
5e55ad7131
commit
d85b54f837
8 changed files with 227 additions and 102 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -3,4 +3,6 @@ data/carrefour
|
||||||
p.*
|
p.*
|
||||||
p
|
p
|
||||||
node_modules/
|
node_modules/
|
||||||
*.db
|
*.db
|
||||||
|
scraper/debug/
|
||||||
|
scraper/x.tsv
|
|
@ -43,7 +43,7 @@ importers:
|
||||||
version: 9.2.2
|
version: 9.2.2
|
||||||
drizzle-orm:
|
drizzle-orm:
|
||||||
specifier: ^0.29.1
|
specifier: ^0.29.1
|
||||||
version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)
|
version: 0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19)
|
||||||
linkedom:
|
linkedom:
|
||||||
specifier: ^0.16.5
|
specifier: ^0.16.5
|
||||||
version: 0.16.5
|
version: 0.16.5
|
||||||
|
@ -59,6 +59,9 @@ importers:
|
||||||
warcio:
|
warcio:
|
||||||
specifier: ^2.2.1
|
specifier: ^2.2.1
|
||||||
version: 2.2.1
|
version: 2.2.1
|
||||||
|
zod:
|
||||||
|
specifier: ^3.22.4
|
||||||
|
version: 3.22.4
|
||||||
devDependencies:
|
devDependencies:
|
||||||
'@types/better-sqlite3':
|
'@types/better-sqlite3':
|
||||||
specifier: ^7.6.8
|
specifier: ^7.6.8
|
||||||
|
@ -66,6 +69,9 @@ importers:
|
||||||
'@types/node':
|
'@types/node':
|
||||||
specifier: ^20.10.5
|
specifier: ^20.10.5
|
||||||
version: 20.10.5
|
version: 20.10.5
|
||||||
|
bun-types:
|
||||||
|
specifier: ^1.0.19
|
||||||
|
version: 1.0.19
|
||||||
tsx:
|
tsx:
|
||||||
specifier: ^4.7.0
|
specifier: ^4.7.0
|
||||||
version: 4.7.0
|
version: 4.7.0
|
||||||
|
@ -274,6 +280,11 @@ packages:
|
||||||
dependencies:
|
dependencies:
|
||||||
undici-types: 5.26.5
|
undici-types: 5.26.5
|
||||||
|
|
||||||
|
/@types/ws@8.5.10:
|
||||||
|
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==}
|
||||||
|
dependencies:
|
||||||
|
'@types/node': 20.10.5
|
||||||
|
|
||||||
/ansi-regex@5.0.1:
|
/ansi-regex@5.0.1:
|
||||||
resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==}
|
resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
|
@ -330,6 +341,13 @@ packages:
|
||||||
ieee754: 1.2.1
|
ieee754: 1.2.1
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
|
/bun-types@1.0.19:
|
||||||
|
resolution: {integrity: sha512-7P5/r+twssrkDQ6HMit2GARMBbAxz1tLLEcMgQOCZeCX9BzNtabktjPCu+DmcvDYDnL/Ke75pmKg9CNBTlCzlQ==}
|
||||||
|
dependencies:
|
||||||
|
'@types/node': 20.10.5
|
||||||
|
'@types/ws': 8.5.10
|
||||||
|
undici-types: 5.26.5
|
||||||
|
|
||||||
/chownr@1.1.4:
|
/chownr@1.1.4:
|
||||||
resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==}
|
resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==}
|
||||||
dev: false
|
dev: false
|
||||||
|
@ -424,7 +442,7 @@ packages:
|
||||||
domhandler: 5.0.3
|
domhandler: 5.0.3
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
/drizzle-orm@0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2):
|
/drizzle-orm@0.29.1(@types/better-sqlite3@7.6.8)(better-sqlite3@9.2.2)(bun-types@1.0.19):
|
||||||
resolution: {integrity: sha512-yItc4unfHnk8XkDD3/bdC63vdboTY7e7I03lCF1OJYABXSIfQYU9BFTQJXMMovVeb3T1/OJWwfW/70T1XPnuUA==}
|
resolution: {integrity: sha512-yItc4unfHnk8XkDD3/bdC63vdboTY7e7I03lCF1OJYABXSIfQYU9BFTQJXMMovVeb3T1/OJWwfW/70T1XPnuUA==}
|
||||||
peerDependencies:
|
peerDependencies:
|
||||||
'@aws-sdk/client-rds-data': '>=3'
|
'@aws-sdk/client-rds-data': '>=3'
|
||||||
|
@ -488,6 +506,7 @@ packages:
|
||||||
dependencies:
|
dependencies:
|
||||||
'@types/better-sqlite3': 7.6.8
|
'@types/better-sqlite3': 7.6.8
|
||||||
better-sqlite3: 9.2.2
|
better-sqlite3: 9.2.2
|
||||||
|
bun-types: 1.0.19
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
/emoji-regex@8.0.0:
|
/emoji-regex@8.0.0:
|
||||||
|
@ -955,3 +974,7 @@ packages:
|
||||||
y18n: 5.0.8
|
y18n: 5.0.8
|
||||||
yargs-parser: 21.1.1
|
yargs-parser: 21.1.1
|
||||||
dev: false
|
dev: false
|
||||||
|
|
||||||
|
/zod@3.22.4:
|
||||||
|
resolution: {integrity: sha512-iC+8Io04lddc+mVqQ9AZ7OQ2MrUKGN+oIQyq1vemgt46jwCwLfhq7/pwnBnNXXXZb8VTVLKwp9EDkx+ryxIWmg==}
|
||||||
|
dev: false
|
||||||
|
|
70
scraper/carrefour.ts
Normal file
70
scraper/carrefour.ts
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import { Precioish, type Precio } from "./scrap.js";
|
||||||
|
import { getProductJsonLd, priceFromMeta } from "./common.js";
|
||||||
|
|
||||||
|
/**
 * Reads the product EAN out of the specification table of a product page.
 *
 * The label cell is located via `td[data-specification="EAN"]`; the value is
 * taken from the `data-specification` attribute of the second child of that
 * cell's parent.
 *
 * @param dom - Parsed page; only `dom.window.document` and
 *   `dom.window.HTMLElement` are used.
 * @returns The non-empty `data-specification` value of the value cell.
 * @throws Error if the label cell, the value cell, or its attribute is missing.
 */
function getEanByTable(dom: Window): string {
  const labelCell = dom.window.document.querySelector(
    'td[data-specification="EAN"]'
  );
  const valueCell = labelCell?.parentElement?.children[1];

  // Only an HTMLElement exposes `dataset`; anything else counts as "not found".
  if (valueCell && valueCell instanceof dom.window.HTMLElement) {
    const ean = valueCell.dataset.specification;
    if (ean) return ean;
  }

  throw new Error("No encontré el EAN");
}
|
||||||
|
/**
 * Parses the JSON payload embedded in a
 * `<template data-type="json" data-varname="...">` element.
 *
 * The payload is read from the `innerHTML` of the template content's first
 * child element and passed to `JSON.parse`; the result is NOT validated
 * against `T`.
 *
 * @param dom - Parsed page.
 * @param varname - Value of the template's `data-varname` attribute.
 * @returns The parsed JSON, typed as `T` by the caller's choice.
 * @throws Error if the template or its first child is missing; `JSON.parse`
 *   errors propagate unchanged.
 */
function parseScriptJson<T>(dom: Window, varname: string): T {
  const selector = `template[data-type="json"][data-varname="${varname}"]`;
  const template =
    dom.window.document.querySelector<HTMLTemplateElement>(selector);
  const payloadEl = template?.content?.children[0];

  if (!payloadEl) {
    throw new Error("no encuentro el script");
  }

  return JSON.parse(payloadEl.innerHTML);
}
|
||||||
|
/**
 * Extracts the product EAN from the `__STATE__` JSON embedded in the page.
 *
 * The state is scanned for the first entry whose key starts with `Product:`
 * and whose `__typename` is `"Product"`; then for the first entry whose key
 * starts with `Product:<cacheId of that product>` and whose `__typename` is
 * `"SKU"`. The SKU's `ean` field is returned.
 *
 * @param dom - Parsed page containing the `__STATE__` template.
 * @returns The EAN of the product's SKU, always as a non-empty string.
 * @throws Error if the product entry, the SKU entry, or the SKU's `ean` is
 *   missing; errors from `parseScriptJson` propagate unchanged.
 */
function eanFromSeedState(dom: Window): string {
  const state = parseScriptJson<object>(dom, "__STATE__");
  const entries = Object.entries(state);

  // `?.` keeps a `null` entry in the state from aborting the scan with a
  // TypeError before the real product entry is reached.
  const productEntry = entries.find(
    ([key, val]) => key.startsWith("Product:") && val?.__typename === "Product"
  );
  if (!productEntry) throw new Error("no encontré el product en el json");

  const skuKeyPrefix = `Product:${productEntry[1].cacheId}`;
  const skuEntry = entries.find(
    ([key, val]) => key.startsWith(skuKeyPrefix) && val?.__typename === "SKU"
  );
  if (!skuEntry) throw new Error("no encontré el sku en el json");

  // The state is unvalidated JSON: make sure the declared `string` return
  // type actually holds instead of leaking `undefined` to the caller.
  const ean: unknown = skuEntry[1].ean;
  if (typeof ean === "number") return String(ean);
  if (typeof ean !== "string" || ean === "")
    throw new Error("no encontré el ean en el sku");
  return ean;
}
|
||||||
|
/**
 * Extracts the EAN from the Dynamic Yield loader script's `ctx` query
 * parameter.
 *
 * The `<script>` whose `src` starts with `//st.dynamicyield.com/st?` is
 * located, its `ctx` parameter is parsed as JSON, and the first element of
 * the `data` array is returned.
 *
 * @param dom - Parsed page.
 * @returns `data[0]` of the decoded `ctx` JSON (not validated further).
 * @throws Error if the script element or the `ctx` parameter is missing;
 *   `JSON.parse` errors propagate unchanged.
 */
function eanFromDynamicYieldScript(dom: Window): string {
  const scriptEl = dom.window.document.querySelector(
    `script[src^="//st.dynamicyield.com/st?"]`
  );
  if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
    throw new Error("no encuentro el script de dynamicyield");

  // The selector matches a protocol-relative src ("//host/..."). If the DOM
  // implementation hands that attribute back verbatim, `new URL(src)` throws
  // "Invalid URL"; a base supplies the missing scheme and is ignored when
  // `src` is already absolute.
  const url = new URL(scriptEl.src, "https://st.dynamicyield.com");
  const ctx = url.searchParams.get("ctx");
  if (!ctx) throw new Error("no hay ctx");
  return JSON.parse(ctx).data[0];
}
|
||||||
|
|
||||||
|
/**
 * Builds a price data point from the HTML of a Carrefour product page.
 *
 * The price comes from the page's price meta tag, the EAN from the embedded
 * `__STATE__` JSON, and the stock flag from the first offer of the Product
 * JSON-LD.
 *
 * @param html - Raw page contents.
 * @returns The `ean`, `precioCentavos` and `inStock` fields of a price row.
 * @throws Whatever the EAN lookup or JSON-LD validation throws when the page
 *   does not have the expected shape.
 */
export function getCarrefourProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  const precioCentavos = priceFromMeta(dom);
  const ean = eanFromSeedState(dom);

  const [offer] = getProductJsonLd(dom).offers.offers;

  return {
    ean,
    precioCentavos,
    inStock: offer.availability === "http://schema.org/InStock",
  };
}
|
50
scraper/common.ts
Normal file
50
scraper/common.ts
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
import { z } from "zod";
|
||||||
|
|
||||||
|
/**
 * Returns the `content` attribute of the first `<meta property="...">` tag
 * matching `prop`.
 *
 * @param dom - Parsed page.
 * @param prop - Value of the `property` attribute to look for; it is
 *   interpolated into the selector as-is.
 * @returns The attribute value, `null` when the tag has no `content`
 *   attribute, or `undefined` when no such tag exists.
 */
export function getMetaProp(dom: Window, prop: string) {
  const metaEl = dom.window.document.querySelector(
    `meta[property="${prop}"]`
  );
  return metaEl?.getAttribute("content");
}
|
||||||
|
|
||||||
|
/**
 * Reads the product price from the `product:price:amount` meta tag and
 * converts it to an integer number of centavos.
 *
 * @param dom - Parsed page.
 * @returns The price in centavos, or `null` when the meta tag is missing,
 *   empty, or does not start with a finite number.
 */
export function priceFromMeta(dom: Window) {
  const precioMeta = getMetaProp(dom, "product:price:amount");
  if (!precioMeta) return null;

  const precio = parseFloat(precioMeta);
  // An unparseable amount is treated like a missing one rather than letting
  // NaN reach the stored row.
  if (!Number.isFinite(precio)) return null;

  // Binary floats cannot represent most decimal amounts exactly
  // (19.99 * 100 === 1998.9999999999998), so round to the nearest centavo.
  return Math.round(precio * 100);
}
|
||||||
|
|
||||||
|
/**
 * Parses every `<script type="application/ld+json">` block in the page.
 *
 * @param dom - Parsed page.
 * @returns The `JSON.parse` result of each script's `innerHTML`, in document
 *   order.
 * @throws SyntaxError from `JSON.parse` if any block is not valid JSON.
 */
function parseJsonLds(dom: Window): object[] {
  const ldScripts = dom.window.document.querySelectorAll(
    'script[type="application/ld+json"]'
  );

  const parsed: object[] = [];
  for (const ldScript of ldScripts) {
    parsed.push(JSON.parse(ldScript.innerHTML));
  }
  return parsed;
}
|
||||||
|
/**
 * Finds the first JSON-LD block in the page whose top-level `@type` equals
 * `type`.
 *
 * @param dom - Parsed page.
 * @param type - Expected `@type` value, compared with `===`.
 * @returns The matching parsed block, or `undefined` if none matches.
 */
function findJsonLd(dom: Window, type: string): object | undefined {
  for (const ld of parseJsonLds(dom)) {
    if ("@type" in ld && ld["@type"] === type) {
      return ld;
    }
  }
  return undefined;
}
|
||||||
|
|
||||||
|
/** Availability URLs accepted on an offer. */
const zAvailabilityLd = z.enum([
  "http://schema.org/OutOfStock",
  "http://schema.org/InStock",
]);

/** A single `Offer` entry: numeric price in ARS plus availability. */
const zOfferLd = z.object({
  "@type": z.literal("Offer"),
  price: z.number(),
  priceCurrency: z.literal("ARS"),
  availability: zAvailabilityLd,
});

/**
 * Shape of the `Product` JSON-LD block the scrapers rely on: name, image and
 * an `offers` object wrapping exactly one offer.
 */
const zProductLd = z.object({
  "@type": z.literal("Product"),
  name: z.string(),
  image: z.string(),
  offers: z.object({
    offers: z.tuple([zOfferLd]),
  }),
});

/** Validated `Product` JSON-LD, as produced by `zProductLd.parse`. */
type ProductLd = z.infer<typeof zProductLd>;
|
||||||
|
|
||||||
|
/**
 * Locates the page's `Product` JSON-LD block and validates it against
 * `zProductLd`.
 *
 * @param dom - Parsed page.
 * @returns The validated product data.
 * @throws The schema's validation error when the block is missing (the
 *   schema is handed `undefined`) or does not match.
 */
export function getProductJsonLd(dom: Window): ProductLd {
  return zProductLd.parse(findJsonLd(dom, "Product"));
}
|
21
scraper/dia.ts
Normal file
21
scraper/dia.ts
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
import { parseHTML } from "linkedom";
|
||||||
|
import { type Precioish } from "./scrap.js";
|
||||||
|
import { getMetaProp, getProductJsonLd, priceFromMeta } from "./common.js";
|
||||||
|
|
||||||
|
/**
 * Builds a price data point from the HTML of a Dia product page.
 *
 * The EAN is read from the `product:retailer_item_id` meta tag, the price
 * from the price meta tag, and the stock flag from the first offer of the
 * Product JSON-LD.
 *
 * @param html - Raw page contents.
 * @returns The `ean`, `precioCentavos` and `inStock` fields of a price row.
 * @throws Error if the EAN meta tag is missing or empty; JSON-LD validation
 *   errors propagate unchanged.
 */
export function getDiaProduct(html: string | Buffer): Precioish {
  const dom = parseHTML(html);

  const ean = getMetaProp(dom, "product:retailer_item_id");
  if (!ean) {
    throw new Error("No encontré el ean");
  }

  const precioCentavos = priceFromMeta(dom);

  const [offer] = getProductJsonLd(dom).offers.offers;
  const inStock = offer.availability === "http://schema.org/InStock";

  return { ean, precioCentavos, inStock };
}
|
|
@ -17,11 +17,13 @@
|
||||||
"nanoid": "^5.0.4",
|
"nanoid": "^5.0.4",
|
||||||
"p-map": "^7.0.0",
|
"p-map": "^7.0.0",
|
||||||
"undici": "^6.2.0",
|
"undici": "^6.2.0",
|
||||||
"warcio": "^2.2.1"
|
"warcio": "^2.2.1",
|
||||||
|
"zod": "^3.22.4"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/better-sqlite3": "^7.6.8",
|
"@types/better-sqlite3": "^7.6.8",
|
||||||
"@types/node": "^20.10.5",
|
"@types/node": "^20.10.5",
|
||||||
|
"bun-types": "^1.0.19",
|
||||||
"tsx": "^4.7.0",
|
"tsx": "^4.7.0",
|
||||||
"typescript": "^5.3.3"
|
"typescript": "^5.3.3"
|
||||||
}
|
}
|
||||||
|
|
147
scraper/scrap.ts
147
scraper/scrap.ts
|
@ -1,128 +1,79 @@
|
||||||
/// <reference lib="dom" />
|
/// <reference lib="dom" />
|
||||||
/// <reference lib="dom.iterable" />
|
/// <reference lib="dom.iterable" />
|
||||||
/// <reference types="node" />
|
/// <reference types="node" />
|
||||||
import { parseHTML } from "linkedom";
|
import { Database } from "bun:sqlite";
|
||||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
import { drizzle } from "drizzle-orm/bun-sqlite";
|
||||||
import Database from "better-sqlite3";
|
|
||||||
import { precios } from "./db/schema.js";
|
import { precios } from "./db/schema.js";
|
||||||
import { WARCParser } from "warcio";
|
import { WARCParser } from "warcio";
|
||||||
import { createReadStream } from "fs";
|
import { createReadStream, createWriteStream } from "fs";
|
||||||
import { writeFile } from "fs/promises";
|
import { writeFile } from "fs/promises";
|
||||||
import { createHash } from "crypto";
|
import { createHash } from "crypto";
|
||||||
|
import { migrate } from "drizzle-orm/bun-sqlite/migrator";
|
||||||
|
import { getCarrefourProduct } from "./carrefour.js";
|
||||||
|
import { getDiaProduct } from "./dia.js";
|
||||||
|
import { join } from "path";
|
||||||
|
|
||||||
const sqlite = new Database("sqlite.db");
|
const sqlite = new Database("sqlite.db");
|
||||||
const db = drizzle(sqlite);
|
const db = drizzle(sqlite);
|
||||||
|
|
||||||
type Precio = typeof precios.$inferInsert;
|
const DEBUG = true;
|
||||||
|
|
||||||
|
export type Precio = typeof precios.$inferInsert;
|
||||||
|
export type Precioish = Omit<Precio, "fetchedAt" | "url">;
|
||||||
|
|
||||||
async function storePrecioPoint(point: Precio) {
|
async function storePrecioPoint(point: Precio) {
|
||||||
await db.insert(precios).values(point);
|
await db.insert(precios).values(point);
|
||||||
}
|
}
|
||||||
|
|
||||||
function getEanByTable(dom: Window): string {
|
|
||||||
const eanLabelEl = dom.window.document.querySelector(
|
|
||||||
'td[data-specification="EAN"]'
|
|
||||||
);
|
|
||||||
const eanValueEl = eanLabelEl?.parentElement?.children[1];
|
|
||||||
if (
|
|
||||||
!eanValueEl ||
|
|
||||||
!(eanValueEl instanceof dom.window.HTMLElement) ||
|
|
||||||
!eanValueEl.dataset.specification
|
|
||||||
)
|
|
||||||
throw new Error("No encontré el EAN");
|
|
||||||
return eanValueEl.dataset.specification;
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseJsonLds(dom: Window): object[] {
|
|
||||||
const scripts = dom.window.document.querySelectorAll(
|
|
||||||
'script[type="application/ld+json"]'
|
|
||||||
);
|
|
||||||
return [...scripts].map((scripts) => JSON.parse(scripts.innerHTML));
|
|
||||||
}
|
|
||||||
function findJsonLd(dom: Window, type: string): object | undefined {
|
|
||||||
return parseJsonLds(dom).find((x) => "@type" in x && x["@type"] === type);
|
|
||||||
}
|
|
||||||
|
|
||||||
function parseScriptJson<T>(dom: Window, varname: string): T {
|
|
||||||
const script = dom.window.document.querySelector<HTMLTemplateElement>(
|
|
||||||
`template[data-type="json"][data-varname="${varname}"]`
|
|
||||||
)?.content?.children[0];
|
|
||||||
if (!script) throw new Error("no encuentro el script");
|
|
||||||
return JSON.parse(script.innerHTML);
|
|
||||||
}
|
|
||||||
|
|
||||||
function eanFromSeedState(dom: Window): string {
|
|
||||||
const json = parseScriptJson<object>(dom, "__STATE__");
|
|
||||||
const productJson = Object.entries(json).find(
|
|
||||||
([key, val]) => key.startsWith("Product:") && val.__typename === "Product"
|
|
||||||
);
|
|
||||||
if (!productJson) throw new Error("no encontré el product en el json");
|
|
||||||
|
|
||||||
const productSkuJson = Object.entries(json).find(
|
|
||||||
([key, val]) =>
|
|
||||||
key.startsWith(`Product:${productJson[1].cacheId}`) &&
|
|
||||||
val.__typename === "SKU"
|
|
||||||
);
|
|
||||||
if (!productSkuJson) throw new Error("no encontré el sku en el json");
|
|
||||||
return productSkuJson[1].ean;
|
|
||||||
}
|
|
||||||
|
|
||||||
function eanFromDynamicYieldScript(dom: Window): string {
|
|
||||||
const scriptEl = dom.window.document.querySelector(
|
|
||||||
`script[src^="//st.dynamicyield.com/st?"]`
|
|
||||||
);
|
|
||||||
if (!scriptEl || !(scriptEl instanceof dom.window.HTMLScriptElement))
|
|
||||||
throw new Error("no encuentro el script de dynamicyield");
|
|
||||||
|
|
||||||
const url = new URL(scriptEl.src);
|
|
||||||
const ctx = url.searchParams.get("ctx");
|
|
||||||
if (!ctx) throw new Error("no hay ctx");
|
|
||||||
return JSON.parse(ctx).data[0];
|
|
||||||
}
|
|
||||||
|
|
||||||
function getCarrefourProduct(html: string | Buffer): Precio {
|
|
||||||
const dom = parseHTML(html);
|
|
||||||
|
|
||||||
const precioMeta = dom.window.document
|
|
||||||
.querySelector(`meta[property="product:price:amount"]`)
|
|
||||||
?.getAttribute("content");
|
|
||||||
if (!precioMeta) throw new Error("No encontré el precio");
|
|
||||||
const precioCentavos = parseFloat(precioMeta) * 100;
|
|
||||||
|
|
||||||
// const productLd = findJsonLd(dom, "Product");
|
|
||||||
const ean = eanFromSeedState(dom);
|
|
||||||
|
|
||||||
return {
|
|
||||||
ean,
|
|
||||||
precioCentavos,
|
|
||||||
fetchedAt: new Date(),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
(async () => {
|
(async () => {
|
||||||
// await migrate(db, { migrationsFolder: "./drizzle" });
|
const o = createWriteStream("x.tsv");
|
||||||
// const p = await getCarrefourProduct(
|
o.write(`ean\tfetchedAt\tprecioCentavos\tinStock\turl\n`);
|
||||||
// "https://www.carrefour.com.ar/bebida-lactea-la-serenisima-ultra-0-grasa-vainilla-900-cc/p"
|
|
||||||
// );
|
|
||||||
// await storePrecioPoint(p);
|
|
||||||
|
|
||||||
const warc = createReadStream(process.argv[2]);
|
const warc = createReadStream(process.argv[2]);
|
||||||
const parser = new WARCParser(warc);
|
const parser = new WARCParser(warc);
|
||||||
|
let progress = { done: 0, errors: 0 };
|
||||||
for await (const record of parser) {
|
for await (const record of parser) {
|
||||||
if (record.warcType === "response") {
|
if (record.warcType === "response") {
|
||||||
|
if (!record.warcTargetURI) throw new Error("no uri");
|
||||||
console.log(record.warcTargetURI);
|
console.log(record.warcTargetURI);
|
||||||
const html = await record.contentText();
|
const html = await record.contentText();
|
||||||
|
|
||||||
|
const url = new URL(record.warcTargetURI);
|
||||||
try {
|
try {
|
||||||
const product = getCarrefourProduct(html);
|
let ish: Precioish | undefined = undefined;
|
||||||
console.log(product);
|
if (url.hostname === "www.carrefour.com.ar")
|
||||||
|
ish = getCarrefourProduct(html);
|
||||||
|
else if (url.hostname === "diaonline.supermercadosdia.com.ar")
|
||||||
|
ish = getDiaProduct(html);
|
||||||
|
else console.error(`Unknown host ${url.hostname}`);
|
||||||
|
|
||||||
|
const p: Precio = {
|
||||||
|
...ish,
|
||||||
|
fetchedAt: new Date(record.warcDate!),
|
||||||
|
url: record.warcTargetURI,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (ish)
|
||||||
|
o.write(
|
||||||
|
`${p.ean}\t${p.fetchedAt}\t${p.precioCentavos}\t${p.inStock}\t${p.url}\n`
|
||||||
|
);
|
||||||
|
|
||||||
|
// console.log(product);
|
||||||
|
progress.done++;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
console.error(error);
|
||||||
const urlHash = createHash("md5")
|
progress.errors++;
|
||||||
.update(record.warcTargetURI!)
|
|
||||||
.digest("hex");
|
if (DEBUG) {
|
||||||
const output = `${urlHash}.html`;
|
const urlHash = createHash("md5")
|
||||||
await writeFile(output, html);
|
.update(record.warcTargetURI!)
|
||||||
console.error(`wrote html to ${output}`);
|
.digest("hex");
|
||||||
|
const output = join("debug", `${urlHash}.html`);
|
||||||
|
await writeFile(output, html);
|
||||||
|
console.error(`wrote html to ${output}`);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
console.debug(progress);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
6
scraper/tsconfig.json
Normal file
6
scraper/tsconfig.json
Normal file
|
@ -0,0 +1,6 @@
|
||||||
|
{
|
||||||
|
"extends": "../tsconfig.json",
|
||||||
|
"compilerOptions": {
|
||||||
|
"types": ["bun-types"]
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue