mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 14:16:19 +00:00
Compare commits
8 commits
26b9f4b17f
...
5e738985f6
Author | SHA1 | Date | |
---|---|---|---|
5e738985f6 | |||
eedeae37ea | |||
a19d1aba65 | |||
12ee9bb592 | |||
adc7cc459f | |||
0207ea2e18 | |||
1108951c79 | |||
c7a578d8cc |
3 changed files with 49 additions and 15 deletions
11
.github/workflows/container.yml
vendored
11
.github/workflows/container.yml
vendored
|
@ -54,7 +54,6 @@ jobs:
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
|
||||||
build-and-push-scraper:
|
build-and-push-scraper:
|
||||||
needs: check
|
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
@ -75,6 +74,16 @@ jobs:
|
||||||
uses: docker/metadata-action@v5
|
uses: docker/metadata-action@v5
|
||||||
with:
|
with:
|
||||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
|
||||||
|
- name: Cache usr/src/app/target
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: usr/src/app/target
|
||||||
|
key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
|
||||||
|
- name: inject usr/src/app/target into docker
|
||||||
|
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
||||||
|
with:
|
||||||
|
cache-source: usr/src/app/target
|
||||||
|
cache-target: /usr/src/app/target
|
||||||
- name: Build and push Docker image
|
- name: Build and push Docker image
|
||||||
uses: docker/build-push-action@v5
|
uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
|
|
|
@ -6,7 +6,10 @@ FROM base as rs-build
|
||||||
RUN apk add --no-cache rust build-base sqlite-dev
|
RUN apk add --no-cache rust build-base sqlite-dev
|
||||||
|
|
||||||
COPY scraper-rs/ .
|
COPY scraper-rs/ .
|
||||||
RUN cargo install --locked --path .
|
RUN --mount=type=cache,sharing=locked,target=/root/.cargo/git \
|
||||||
|
--mount=type=cache,sharing=locked,target=/root/.cargo/registry \
|
||||||
|
--mount=type=cache,sharing=locked,target=/usr/src/app/target \
|
||||||
|
cargo install --locked --path .
|
||||||
|
|
||||||
FROM base
|
FROM base
|
||||||
RUN apk add --no-cache sqlite sqlite-libs
|
RUN apk add --no-cache sqlite sqlite-libs
|
||||||
|
|
|
@ -21,6 +21,16 @@ enum Supermercado {
|
||||||
Carrefour,
|
Carrefour,
|
||||||
Coto,
|
Coto,
|
||||||
}
|
}
|
||||||
|
impl Supermercado {
|
||||||
|
fn host(self: &Self) -> &'static str {
|
||||||
|
match self {
|
||||||
|
Self::Dia => "diaonline.supermercadosdia.com.ar",
|
||||||
|
Self::Carrefour => "www.carrefour.com.ar",
|
||||||
|
Self::Coto => "www.cotodigital3.com.ar",
|
||||||
|
Self::Jumbo => "www.jumbo.com.ar",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Parser)] // requires `derive` feature
|
#[derive(Parser)] // requires `derive` feature
|
||||||
enum Args {
|
enum Args {
|
||||||
|
@ -88,8 +98,6 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
|
||||||
let client = client.clone();
|
let client = client.clone();
|
||||||
tokio::spawn(fetch_and_save(client, url, pool))
|
tokio::spawn(fetch_and_save(client, url, pool))
|
||||||
})
|
})
|
||||||
// https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
|
|
||||||
.boxed()
|
|
||||||
.buffer_unordered(n_coroutines)
|
.buffer_unordered(n_coroutines)
|
||||||
.fold(Counters::default(), move |x, y| {
|
.fold(Counters::default(), move |x, y| {
|
||||||
let ret = y.unwrap();
|
let ret = y.unwrap();
|
||||||
|
@ -110,7 +118,11 @@ fn connect_db() -> Pool {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_client() -> reqwest::Client {
|
fn build_client() -> reqwest::Client {
|
||||||
reqwest::ClientBuilder::default().build().unwrap()
|
reqwest::ClientBuilder::default()
|
||||||
|
.timeout(Duration::from_secs(60 * 5))
|
||||||
|
.connect_timeout(Duration::from_secs(60))
|
||||||
|
.build()
|
||||||
|
.unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default, Debug)]
|
#[derive(Default, Debug)]
|
||||||
|
@ -128,7 +140,7 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
||||||
counters.success += 1;
|
counters.success += 1;
|
||||||
pool.get().await.unwrap().interact(move |conn| conn.execute(
|
pool.get().await.unwrap().interact(move |conn| conn.execute(
|
||||||
"INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
|
"INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
|
||||||
rusqlite::params![
|
rusqlite::params![
|
||||||
res.ean,
|
res.ean,
|
||||||
res.fetched_at,
|
res.fetched_at,
|
||||||
res.precio_centavos,
|
res.precio_centavos,
|
||||||
|
@ -142,8 +154,8 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
||||||
)).await.unwrap().unwrap();
|
)).await.unwrap().unwrap();
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
match err.downcast_ref::<FetchError>() {
|
match err.downcast_ref::<reqwest::Error>() {
|
||||||
Some(FetchError::Http(e)) => match e.status() {
|
Some(e) => match e.status() {
|
||||||
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
|
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
|
||||||
_ => counters.errored += 1,
|
_ => counters.errored += 1,
|
||||||
},
|
},
|
||||||
|
@ -158,8 +170,6 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
enum FetchError {
|
enum FetchError {
|
||||||
#[error("reqwest error")]
|
|
||||||
Http(#[from] reqwest::Error),
|
|
||||||
#[error("parse error")]
|
#[error("parse error")]
|
||||||
Parse(#[from] SimpleError),
|
Parse(#[from] SimpleError),
|
||||||
#[error("tl error")]
|
#[error("tl error")]
|
||||||
|
@ -191,8 +201,7 @@ async fn fetch_and_parse(
|
||||||
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
|
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
|
||||||
.await?
|
.await?
|
||||||
.text()
|
.text()
|
||||||
.await
|
.await?;
|
||||||
.map_err(FetchError::Http)?;
|
|
||||||
|
|
||||||
let maybe_point = { scrap_url(client, url, &body).await };
|
let maybe_point = { scrap_url(client, url, &body).await };
|
||||||
|
|
||||||
|
@ -291,19 +300,32 @@ impl Auto {
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
let links: Vec<String> = {
|
let links: Vec<String> = {
|
||||||
|
let search = format!("%{}%", supermercado.host());
|
||||||
self.pool
|
self.pool
|
||||||
.get()
|
.get()
|
||||||
.await?
|
.await?
|
||||||
.interact(|conn| -> anyhow::Result<Vec<String>> {
|
.interact(move |conn| -> anyhow::Result<Vec<String>> {
|
||||||
Ok(conn
|
Ok(conn
|
||||||
.prepare(r#"SELECT url FROM producto_urls;"#)?
|
.prepare(
|
||||||
.query_map([], |r| r.get::<_, String>(0))?
|
r#"SELECT url FROM producto_urls
|
||||||
|
WHERE url LIKE ?1;"#,
|
||||||
|
)?
|
||||||
|
.query_map(rusqlite::params![search], |r| r.get::<_, String>(0))?
|
||||||
.map(|r| r.unwrap())
|
.map(|r| r.unwrap())
|
||||||
.collect())
|
.collect())
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.unwrap()?
|
.unwrap()?
|
||||||
};
|
};
|
||||||
|
// {
|
||||||
|
// let debug_path = PathBuf::from("debug/");
|
||||||
|
// tokio::fs::create_dir_all(&debug_path).await.unwrap();
|
||||||
|
// let file_path = debug_path.join(format!("{}.txt", nanoid!()));
|
||||||
|
// tokio::fs::write(&file_path, &links.join("\n"))
|
||||||
|
// .await
|
||||||
|
// .unwrap();
|
||||||
|
// tracing::info!("Lista de {:?}: {:?}", &supermercado, file_path.display());
|
||||||
|
// }
|
||||||
{
|
{
|
||||||
let t0 = now_sec();
|
let t0 = now_sec();
|
||||||
let counters = fetch_list(&self.pool, links).await;
|
let counters = fetch_list(&self.pool, links).await;
|
||||||
|
|
Loading…
Reference in a new issue