Compare commits

...

5 commits

Author SHA1 Message Date
5c52a12fdf juntar por ean busqueda
fixes #11
2024-01-02 10:34:43 -03:00
47f566cd82 bunlock 2024-01-02 10:29:29 -03:00
97d94037e3 parsear urls xml 2024-01-02 10:28:55 -03:00
4d3793ddad chore: dockerignore 2024-01-02 00:24:04 -03:00
951ac32368 chore 2024-01-02 00:23:30 -03:00
6 changed files with 11 additions and 4 deletions

View file

@@ -4,4 +4,7 @@ data/carrefour/
downloader/ downloader/
node_modules/ node_modules/
*/node_modules/ */node_modules/
*/Containerfile */Containerfile
*.warc.zst
.git
scraper/debug/

BIN
bun.lockb

Binary file not shown.

View file

@@ -1,4 +1,5 @@
import pMap from "p-map"; import pMap from "p-map";
import { decodeXML } from "entities";
import { saveUrls } from "db-datos/urlHelpers.js"; import { saveUrls } from "db-datos/urlHelpers.js";
export async function scrapCarrefourProducts() { export async function scrapCarrefourProducts() {
@@ -31,7 +32,7 @@ async function scrapBySitemap() {
text(element) { text(element) {
const txt = element.text.trim(); const txt = element.text.trim();
if (!txt) return; if (!txt) return;
urls.add(txt); urls.add(decodeXML(txt));
}, },
}) })
.transform(new Response(xml)); .transform(new Response(xml));

View file

@@ -1,4 +1,5 @@
import pMap from "p-map"; import pMap from "p-map";
import { decodeXML } from "entities";
import { parseHTML } from "linkedom"; import { parseHTML } from "linkedom";
import { getHtml } from "../scraper/fetch.js"; import { getHtml } from "../scraper/fetch.js";
import { saveUrls } from "db-datos/urlHelpers.js"; import { saveUrls } from "db-datos/urlHelpers.js";
@@ -90,7 +91,7 @@ async function scrapBySitemap() {
text(element) { text(element) {
const txt = element.text.trim(); const txt = element.text.trim();
if (!txt) return; if (!txt) return;
urls.add(txt); urls.add(decodeXML(txt));
}, },
}) })
.transform(new Response(xml)); .transform(new Response(xml));

View file

@@ -17,6 +17,7 @@
"date-fns": "^3.0.6", "date-fns": "^3.0.6",
"db-datos": "workspace:^", "db-datos": "workspace:^",
"drizzle-orm": "=0.29.1", "drizzle-orm": "=0.29.1",
"entities": "^4.5.0",
"linkedom": "^0.16.5", "linkedom": "^0.16.5",
"nanoid": "^5.0.4", "nanoid": "^5.0.4",
"p-map": "^7.0.1", "p-map": "^7.0.1",

View file

@@ -11,7 +11,8 @@ export const load: PageServerLoad = async ({ url }) => {
results = db.all( results = db.all(
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
join precios p on p.ean = f.ean join precios p on p.ean = f.ean
where f.name match ${query};`, where f.name match ${query}
group by p.ean;`,
); );
} }