mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 03:26:19 +00:00
Compare commits
9 commits
958daf0abd
...
e97b355715
Author | SHA1 | Date | |
---|---|---|---|
e97b355715 | |||
dade60d677 | |||
d2dbd3c093 | |||
fe9b1fd3e7 | |||
71d3429450 | |||
1214aa5fbe | |||
c52ed1d569 | |||
d805ebbe72 | |||
e2885a59e0 |
9 changed files with 63 additions and 36 deletions
19
.github/workflows/container.yml
vendored
19
.github/workflows/container.yml
vendored
|
@ -10,7 +10,7 @@ env:
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
check:
|
check:
|
||||||
name: chequear typescript
|
name: chequear typescript del sitio
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v3
|
||||||
|
@ -29,6 +29,7 @@ jobs:
|
||||||
working-directory: ./sitio
|
working-directory: ./sitio
|
||||||
|
|
||||||
build-and-push-sitio:
|
build-and-push-sitio:
|
||||||
|
name: Compilar contenedor del sitio
|
||||||
needs: check
|
needs: check
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
|
@ -37,26 +38,32 @@ jobs:
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
- name: Log in to the Container registry
|
- name: Log in to the Container registry
|
||||||
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
|
uses: docker/login-action@v3
|
||||||
with:
|
with:
|
||||||
registry: ${{ env.REGISTRY }}
|
registry: ${{ env.REGISTRY }}
|
||||||
username: ${{ github.actor }}
|
username: ${{ github.actor }}
|
||||||
password: ${{ secrets.GITHUB_TOKEN }}
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
- name: Extract metadata (tags, labels) for Docker
|
- name: Extract metadata (tags, labels) for Docker
|
||||||
id: meta
|
id: meta
|
||||||
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
|
uses: docker/metadata-action@v5
|
||||||
with:
|
with:
|
||||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
|
||||||
- name: Build and push Docker image
|
- name: Build and push Docker image
|
||||||
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
|
uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
context: .
|
context: .
|
||||||
|
file: Dockerfile
|
||||||
push: true
|
push: true
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio:buildcache
|
||||||
|
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio:buildcache,mode=max
|
||||||
|
|
||||||
build-and-push-scraper:
|
build-and-push-scraper:
|
||||||
|
name: Compilar contenedor del scraper
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
@ -95,5 +102,5 @@ jobs:
|
||||||
push: true
|
push: true
|
||||||
tags: ${{ steps.meta.outputs.tags }}
|
tags: ${{ steps.meta.outputs.tags }}
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache
|
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper:buildcache
|
||||||
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache,mode=max
|
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper:buildcache,mode=max
|
||||||
|
|
23
Dockerfile
23
Dockerfile
|
@ -1,29 +1,26 @@
|
||||||
FROM docker.io/oven/bun:1-alpine AS base
|
FROM cgr.dev/chainguard/wolfi-base AS base
|
||||||
WORKDIR /usr/src/app
|
WORKDIR /usr/src/app
|
||||||
|
|
||||||
FROM base as build
|
FROM base as build
|
||||||
ENV NODE_ENV=production
|
RUN apk add --no-cache nodejs npm
|
||||||
RUN apk add --no-cache nodejs
|
RUN npm install --global pnpm
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN bun install --frozen-lockfile
|
COPY db-datos/drizzle .
|
||||||
RUN cd sitio && \
|
RUN cd sitio && \
|
||||||
bun run build
|
pnpm install && \
|
||||||
RUN bun build scraper/cli.ts --target=bun --outfile=/tmp/cli.build.js
|
pnpm build
|
||||||
|
|
||||||
FROM cgr.dev/chainguard/wolfi-base
|
FROM base
|
||||||
RUN apk add --no-cache nodejs npm jq bun sqlite
|
ENV NODE_ENV=production
|
||||||
|
RUN apk add --no-cache nodejs npm jq sqlite
|
||||||
|
|
||||||
# Sitio
|
# Sitio
|
||||||
COPY --from=build /usr/src/app/sitio/package.json package.real.json
|
COPY --from=build /usr/src/app/sitio/package.json package.real.json
|
||||||
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
|
RUN sh -c 'echo {\"name\":\"sitio\",\"type\":\"module\",\"dependencies\":$(jq .dependencies < package.real.json)} > package.json' && npm install
|
||||||
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
|
COPY --from=build /usr/src/app/db-datos node_modules/db-datos
|
||||||
COPY --from=build /usr/src/app/sitio/build .
|
COPY --from=build /usr/src/app/sitio/build .
|
||||||
|
COPY --from=build /usr/src/app/db-datos/drizzle .
|
||||||
|
|
||||||
# Scraper
|
|
||||||
COPY --from=build /tmp/cli.build.js /bin/scraper
|
|
||||||
COPY --from=build /usr/src/app/db-datos/drizzle /bin/drizzle
|
|
||||||
|
|
||||||
ENV NODE_ENV=production
|
|
||||||
ENV DB_PATH=/db/db.db
|
ENV DB_PATH=/db/db.db
|
||||||
EXPOSE 3000
|
EXPOSE 3000
|
||||||
|
|
||||||
|
|
|
@ -2,10 +2,7 @@
|
||||||
import Database from "better-sqlite3";
|
import Database from "better-sqlite3";
|
||||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
import { drizzle } from "drizzle-orm/better-sqlite3";
|
||||||
import { DB_PATH } from "./drizzle.config.js";
|
import { DB_PATH } from "./drizzle.config.js";
|
||||||
import { migrateDb } from "./migrate.js";
|
|
||||||
import * as schema from "./schema.js";
|
import * as schema from "./schema.js";
|
||||||
|
|
||||||
migrateDb();
|
|
||||||
|
|
||||||
export const sqlite = new Database(DB_PATH);
|
export const sqlite = new Database(DB_PATH);
|
||||||
export const db = drizzle(sqlite, { schema });
|
export const db = drizzle(sqlite, { schema });
|
||||||
|
|
10
scraper-rs/Cargo.lock
generated
10
scraper-rs/Cargo.lock
generated
|
@ -902,6 +902,15 @@ dependencies = [
|
||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quick-xml"
|
||||||
|
version = "0.31.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quote"
|
name = "quote"
|
||||||
version = "1.0.35"
|
version = "1.0.35"
|
||||||
|
@ -1132,6 +1141,7 @@ dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"itertools",
|
"itertools",
|
||||||
"nanoid",
|
"nanoid",
|
||||||
|
"quick-xml",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"rusqlite",
|
"rusqlite",
|
||||||
|
|
|
@ -14,6 +14,7 @@ deadpool-sqlite = "0.7.0"
|
||||||
futures = "0.3.30"
|
futures = "0.3.30"
|
||||||
itertools = "0.12.0"
|
itertools = "0.12.0"
|
||||||
nanoid = "0.4.0"
|
nanoid = "0.4.0"
|
||||||
|
quick-xml = "0.31.0"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
reqwest = { version = "0.11.23", default-features = false, features = [
|
reqwest = { version = "0.11.23", default-features = false, features = [
|
||||||
"rustls-tls",
|
"rustls-tls",
|
||||||
|
|
|
@ -107,14 +107,34 @@ pub fn in_stock_from_meta(dom: &VDom) -> anyhow::Result<bool> {
|
||||||
|
|
||||||
pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
pub fn parse_urls_from_sitemap(sitemap: &str) -> anyhow::Result<Vec<String>> {
|
||||||
let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
|
let dom = tl::parse(sitemap, tl::ParserOptions::default())?;
|
||||||
Ok(dom
|
dom.query_selector("loc")
|
||||||
.query_selector("loc")
|
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.filter_map(|h| h.get(dom.parser()))
|
.filter_map(|h| h.get(dom.parser()))
|
||||||
.filter_map(|n| n.as_tag())
|
.filter_map(|n| n.as_tag())
|
||||||
.map(|t| t.inner_text(dom.parser()))
|
.map(|t| t.inner_text(dom.parser()))
|
||||||
.map(|s| s.to_string())
|
.map(|s| -> anyhow::Result<String> {
|
||||||
.collect())
|
Ok(quick_xml::escape::unescape(s.as_ref())?.to_string())
|
||||||
|
})
|
||||||
|
.try_collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_decode_url() -> anyhow::Result<()> {
|
||||||
|
let links = parse_urls_from_sitemap(
|
||||||
|
r#"
|
||||||
|
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
|
||||||
|
<url>
|
||||||
|
<loc>https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g​-684952/p</loc>
|
||||||
|
<lastmod>2024-01-12T10:41:25.962Z</lastmod>
|
||||||
|
</url>"#,
|
||||||
|
)?;
|
||||||
|
assert_eq!(links[0], "https://www.carrefour.com.ar/postre-danette-mousse-dulce-de-leche-80-g\u{200b}-684952/p");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
|
pub async fn get_urls_from_sitemap(sitemaps: Vec<&str>) -> anyhow::Result<Vec<String>> {
|
||||||
|
|
|
@ -1,10 +1,2 @@
|
||||||
import Database from "better-sqlite3";
|
export { db } from "db-datos/db.js";
|
||||||
import { drizzle } from "drizzle-orm/better-sqlite3";
|
|
||||||
import * as schema from "db-datos/schema.js";
|
|
||||||
import { env } from "$env/dynamic/private";
|
|
||||||
|
|
||||||
const sqlite = new Database(env.DB_PATH ?? "../scraper/sqlite.db");
|
|
||||||
const db = drizzle(sqlite, { schema });
|
|
||||||
|
|
||||||
export { db };
|
|
||||||
export * as schema from "db-datos/schema.js";
|
export * as schema from "db-datos/schema.js";
|
||||||
|
|
|
@ -9,7 +9,9 @@
|
||||||
"skipLibCheck": true,
|
"skipLibCheck": true,
|
||||||
"sourceMap": true,
|
"sourceMap": true,
|
||||||
"strict": true,
|
"strict": true,
|
||||||
"moduleResolution": "bundler"
|
"moduleResolution": "bundler",
|
||||||
|
"maxNodeModuleJsDepth": 1,
|
||||||
|
"declaration": true
|
||||||
}
|
}
|
||||||
// Path aliases are handled by https://kit.svelte.dev/docs/configuration#alias
|
// Path aliases are handled by https://kit.svelte.dev/docs/configuration#alias
|
||||||
//
|
//
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
"strict": true,
|
"strict": true,
|
||||||
"skipLibCheck": true,
|
"skipLibCheck": true,
|
||||||
"esModuleInterop": true,
|
"esModuleInterop": true,
|
||||||
|
"allowJs": true,
|
||||||
"checkJs": true,
|
"checkJs": true,
|
||||||
"noEmit": true,
|
"noEmit": true,
|
||||||
"forceConsistentCasingInFileNames": true
|
"forceConsistentCasingInFileNames": true
|
||||||
|
|
Loading…
Reference in a new issue