Compare commits

..

No commits in common. "e97b35571533c1d35aeae3a333eb8d75f0b7ec0b" and "958daf0abd40b4a49501c146bce72a7a9bfb5162" have entirely different histories.

9 changed files with 38 additions and 65 deletions

View file

@ -10,7 +10,7 @@ env:
jobs: jobs:
check: check:
name: chequear typescript del sitio name: chequear typescript
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
@ -29,7 +29,6 @@ jobs:
working-directory: ./sitio working-directory: ./sitio
build-and-push-sitio: build-and-push-sitio:
name: Compilar contenedor del sitio
needs: check needs: check
runs-on: ubuntu-latest runs-on: ubuntu-latest
permissions: permissions:
@ -38,32 +37,26 @@ jobs:
steps: steps:
- name: Checkout repository - name: Checkout repository
uses: actions/checkout@v4 uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to the Container registry - name: Log in to the Container registry
uses: docker/login-action@v3 uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with: with:
registry: ${{ env.REGISTRY }} registry: ${{ env.REGISTRY }}
username: ${{ github.actor }} username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }} password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for Docker - name: Extract metadata (tags, labels) for Docker
id: meta id: meta
uses: docker/metadata-action@v5 uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with: with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
- name: Build and push Docker image - name: Build and push Docker image
uses: docker/build-push-action@v5 uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with: with:
context: . context: .
file: Dockerfile
push: true push: true
tags: ${{ steps.meta.outputs.tags }} tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }} labels: ${{ steps.meta.outputs.labels }}
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio:buildcache
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio:buildcache,mode=max
build-and-push-scraper: build-and-push-scraper:
name: Compilar contenedor del scraper
runs-on: ubuntu-latest runs-on: ubuntu-latest
permissions: permissions:
contents: read contents: read
@ -102,5 +95,5 @@ jobs:
push: true push: true
tags: ${{ steps.meta.outputs.tags }} tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }} labels: ${{ steps.meta.outputs.labels }}
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper:buildcache cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper:buildcache,mode=max cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache,mode=max

View file

@ -1,26 +1,29 @@
FROM cgr.dev/chainguard/wolfi-base AS base FROM docker.io/oven/bun:1-alpine AS base
WORKDIR /usr/src/app WORKDIR /usr/src/app
FROM base as build FROM base as build
RUN apk add --no-cache nodejs npm
RUN npm install --global pnpm
COPY . .
COPY db-datos/drizzle .
RUN cd sitio && \
pnpm install && \
pnpm build
FROM base
ENV NODE_ENV=production ENV NODE_ENV=production
RUN apk add --no-cache nodejs npm jq sqlite RUN apk add --no-cache nodejs
COPY . .
RUN bun install --frozen-lockfile
RUN cd sitio && \
bun run build
RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js
FROM cgr.dev/chainguard/wolfi-base
RUN apk add --no-cache nodejs npm jq bun sqlite
# Sitio # Sitio
COPY --from=build /usr/src/app/sitio/package.json package.real.json COPY --from=build /usr/src/app/sitio/package.json package.real.json
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
COPY --from=build /usr/src/app/db-datos node_modules/db-datos COPY --from=build /usr/src/app/db-datos node_modules/db-datos
COPY --from=build /usr/src/app/sitio/build . COPY --from=build /usr/src/app/sitio/build .
COPY --from=build /usr/src/app/db-datos/drizzle .
# Scraper
COPY --from=build /tmp/cli.build.js /bin/scraper
COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle
ENV NODE_ENV=production
ENV DB_PATH=/db/db.db ENV DB_PATH=/db/db.db
EXPOSE 3000 EXPOSE 3000

View file

@ -2,7 +2,10 @@
import Database from "better-sqlite3"; import Database from "better-sqlite3";
import { drizzle } from "drizzle-orm/better-sqlite3"; import { drizzle } from "drizzle-orm/better-sqlite3";
import { DB_PATH } from "./drizzle.config.js"; import { DB_PATH } from "./drizzle.config.js";
import { migrateDb } from "./migrate.js";
import * as schema from "./schema.js"; import * as schema from "./schema.js";
migrateDb();
export const sqlite = new Database(DB_PATH); export const sqlite = new Database(DB_PATH);
export const db = drizzle(sqlite, { schema }); export const db = drizzle(sqlite, { schema });

10
scraper-rs/Cargo.lock generated
View file

@ -902,15 +902,6 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "quick-xml"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "quote" name = "quote"
version = "1.0.35" version = "1.0.35"
@ -1141,7 +1132,6 @@ dependencies = [
"futures", "futures",
"itertools", "itertools",
"nanoid", "nanoid",
"quick-xml",
"rand 0.8.5", "rand 0.8.5",
"reqwest", "reqwest",
"rusqlite", "rusqlite",

View file

@ -14,7 +14,6 @@ deadpool-sqlite = "0.7.0"
futures = "0.3.30" futures = "0.3.30"
itertools = "0.12.0" itertools = "0.12.0"
nanoid = "0.4.0" nanoid = "0.4.0"
quick-xml = "0.31.0"
rand = "0.8.5" rand = "0.8.5"
reqwest = { version = "0.11.23", default-features = false, features = [ reqwest = { version = "0.11.23", default-features = false, features = [
"rustls-tls", "rustls-tls",

View file

@ -107,34 +107,14 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> { pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
let dom = tl::parse(sitemap, tl::ParserOptions::default())?; let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
dom.query_selector("loc") Ok(dom
.query_selector("loc")
.unwrap() .unwrap()
.filter_map(|h| h.get(dom.parser())) .filter_map(|h| h.get(dom.parser()))
.filter_map(|n| n.as_tag()) .filter_map(|n| n.as_tag())
.map(|t| t.inner_text(dom.parser())) .map(|t| t.inner_text(dom.parser()))
.map(|s| -> anyhow::Result<String> { .map(|s| s.to_string())
Ok(quick_xml::escape::unescape(s.as_ref())?.to_string()) .collect())
})
.try_collect()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_decode_url() -> anyhow::Result<()> {
let links = parse_urls_from_sitemap(
r#"
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<url>
<loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g&#x200B;-684952/p</loc>
<lastmod>2024-01-12T10:41:25.962Z</lastmod>
</url>"#,
)?;
assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
Ok(())
}
} }
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> { pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {

View file

@ -1,2 +1,10 @@
export { db } from "db-datos/db.js"; import Database from "better-sqlite3";
import { drizzle } from "drizzle-orm/better-sqlite3";
import * as schema from "db-datos/schema.js";
import { env } from "$env/dynamic/private";
const sqlite = new Database(env.DB_PATH ?? "../scraper/sqlite.db");
const db = drizzle(sqlite, { schema });
export { db };
export * as schema from "db-datos/schema.js"; export * as schema from "db-datos/schema.js";

View file

@ -9,9 +9,7 @@
"skipLibCheck": true, "skipLibCheck": true,
"sourceMap": true, "sourceMap": true,
"strict": true, "strict": true,
"moduleResolution": "bundler", "moduleResolution": "bundler"
"maxNodeModuleJsDepth": 1,
"declaration": true
} }
// Path aliases are handled by https://kit.svelte.dev/docs/configuration#alias // Path aliases are handled by https://kit.svelte.dev/docs/configuration#alias
// //

View file

@ -11,7 +11,6 @@
"strict": true, "strict": true,
"skipLibCheck": true, "skipLibCheck": true,
"esModuleInterop": true, "esModuleInterop": true,
"allowJs": true,
"checkJs": true, "checkJs": true,
"noEmit": true, "noEmit": true,
"forceConsistentCasingInFileNames": true "forceConsistentCasingInFileNames": true