mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
Compare commits
5 commits
c4b49814fb
...
5c52a12fdf
Author | SHA1 | Date | |
---|---|---|---|
5c52a12fdf | |||
47f566cd82 | |||
97d94037e3 | |||
4d3793ddad | |||
951ac32368 |
6 changed files with 11 additions and 4 deletions
|
@@ -4,4 +4,7 @@ data/carrefour/
|
||||||
downloader/
|
downloader/
|
||||||
node_modules/
|
node_modules/
|
||||||
*/node_modules/
|
*/node_modules/
|
||||||
*/Containerfile
|
*/Containerfile
|
||||||
|
*.warc.zst
|
||||||
|
.git
|
||||||
|
scraper/debug/
|
BIN
bun.lockb
BIN
bun.lockb
Binary file not shown.
|
@@ -1,4 +1,5 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
|
import { decodeXML } from "entities";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
|
||||||
export async function scrapCarrefourProducts() {
|
export async function scrapCarrefourProducts() {
|
||||||
|
@@ -31,7 +32,7 @@ async function scrapBySitemap() {
|
||||||
text(element) {
|
text(element) {
|
||||||
const txt = element.text.trim();
|
const txt = element.text.trim();
|
||||||
if (!txt) return;
|
if (!txt) return;
|
||||||
urls.add(txt);
|
urls.add(decodeXML(txt));
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
.transform(new Response(xml));
|
.transform(new Response(xml));
|
||||||
|
|
|
@@ -1,4 +1,5 @@
|
||||||
import pMap from "p-map";
|
import pMap from "p-map";
|
||||||
|
import { decodeXML } from "entities";
|
||||||
import { parseHTML } from "linkedom";
|
import { parseHTML } from "linkedom";
|
||||||
import { getHtml } from "../scraper/fetch.js";
|
import { getHtml } from "../scraper/fetch.js";
|
||||||
import { saveUrls } from "db-datos/urlHelpers.js";
|
import { saveUrls } from "db-datos/urlHelpers.js";
|
||||||
|
@@ -90,7 +91,7 @@ async function scrapBySitemap() {
|
||||||
text(element) {
|
text(element) {
|
||||||
const txt = element.text.trim();
|
const txt = element.text.trim();
|
||||||
if (!txt) return;
|
if (!txt) return;
|
||||||
urls.add(txt);
|
urls.add(decodeXML(txt));
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
.transform(new Response(xml));
|
.transform(new Response(xml));
|
||||||
|
|
|
@@ -17,6 +17,7 @@
|
||||||
"date-fns": "^3.0.6",
|
"date-fns": "^3.0.6",
|
||||||
"db-datos": "workspace:^",
|
"db-datos": "workspace:^",
|
||||||
"drizzle-orm": "=0.29.1",
|
"drizzle-orm": "=0.29.1",
|
||||||
|
"entities": "^4.5.0",
|
||||||
"linkedom": "^0.16.5",
|
"linkedom": "^0.16.5",
|
||||||
"nanoid": "^5.0.4",
|
"nanoid": "^5.0.4",
|
||||||
"p-map": "^7.0.1",
|
"p-map": "^7.0.1",
|
||||||
|
|
|
@@ -11,7 +11,8 @@ export const load: PageServerLoad = async ({ url }) => {
|
||||||
results = db.all(
|
results = db.all(
|
||||||
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
sql`select p.ean, p.name, p.image_url as imageUrl from precios_fts f
|
||||||
join precios p on p.ean = f.ean
|
join precios p on p.ean = f.ean
|
||||||
where f.name match ${query};`,
|
where f.name match ${query}
|
||||||
|
group by p.ean;`,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue