mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-26 19:46:19 +00:00
Compare commits
No commits in common. "5e738985f693ec3d8b11743ce45bcaf42c717486" and "26b9f4b17f1494f3a390e3f0e406b0571fd4310f" have entirely different histories.
5e738985f6
...
26b9f4b17f
3 changed files with 15 additions and 49 deletions
11
.github/workflows/container.yml
vendored
11
.github/workflows/container.yml
vendored
|
@ -54,6 +54,7 @@ jobs:
|
||||||
labels: ${{ steps.meta.outputs.labels }}
|
labels: ${{ steps.meta.outputs.labels }}
|
||||||
|
|
||||||
build-and-push-scraper:
|
build-and-push-scraper:
|
||||||
|
needs: check
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
@ -74,16 +75,6 @@ jobs:
|
||||||
uses: docker/metadata-action@v5
|
uses: docker/metadata-action@v5
|
||||||
with:
|
with:
|
||||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
|
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
|
||||||
- name: Cache usr/src/app/target
|
|
||||||
uses: actions/cache@v3
|
|
||||||
with:
|
|
||||||
path: usr/src/app/target
|
|
||||||
key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
|
|
||||||
- name: inject usr/src/app/target into docker
|
|
||||||
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
|
||||||
with:
|
|
||||||
cache-source: usr/src/app/target
|
|
||||||
cache-target: /usr/src/app/target
|
|
||||||
- name: Build and push Docker image
|
- name: Build and push Docker image
|
||||||
uses: docker/build-push-action@v5
|
uses: docker/build-push-action@v5
|
||||||
with:
|
with:
|
||||||
|
|
|
@ -6,10 +6,7 @@ FROM base as rs-build
|
||||||
RUN apk add --no-cache rust build-base sqlite-dev
|
RUN apk add --no-cache rust build-base sqlite-dev
|
||||||
|
|
||||||
COPY scraper-rs/ .
|
COPY scraper-rs/ .
|
||||||
RUN --mount=type=cache,sharing=locked,target=/root/.cargo/git \
|
RUN cargo install --locked --path .
|
||||||
--mount=type=cache,sharing=locked,target=/root/.cargo/registry \
|
|
||||||
--mount=type=cache,sharing=locked,target=/usr/src/app/target \
|
|
||||||
cargo install --locked --path .
|
|
||||||
|
|
||||||
FROM base
|
FROM base
|
||||||
RUN apk add --no-cache sqlite sqlite-libs
|
RUN apk add --no-cache sqlite sqlite-libs
|
||||||
|
|
|
@ -21,16 +21,6 @@ enum Supermercado {
|
||||||
Carrefour,
|
Carrefour,
|
||||||
Coto,
|
Coto,
|
||||||
}
|
}
|
||||||
impl Supermercado {
|
|
||||||
fn host(self: &Self) -> &'static str {
|
|
||||||
match self {
|
|
||||||
Self::Dia => "diaonline.supermercadosdia.com.ar",
|
|
||||||
Self::Carrefour => "www.carrefour.com.ar",
|
|
||||||
Self::Coto => "www.cotodigital3.com.ar",
|
|
||||||
Self::Jumbo => "www.jumbo.com.ar",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Parser)] // requires `derive` feature
|
#[derive(Parser)] // requires `derive` feature
|
||||||
enum Args {
|
enum Args {
|
||||||
|
@ -98,6 +88,8 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
|
||||||
let client = client.clone();
|
let client = client.clone();
|
||||||
tokio::spawn(fetch_and_save(client, url, pool))
|
tokio::spawn(fetch_and_save(client, url, pool))
|
||||||
})
|
})
|
||||||
|
// https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
|
||||||
|
.boxed()
|
||||||
.buffer_unordered(n_coroutines)
|
.buffer_unordered(n_coroutines)
|
||||||
.fold(Counters::default(), move |x, y| {
|
.fold(Counters::default(), move |x, y| {
|
||||||
let ret = y.unwrap();
|
let ret = y.unwrap();
|
||||||
|
@ -118,11 +110,7 @@ fn connect_db() -> Pool {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build_client() -> reqwest::Client {
|
fn build_client() -> reqwest::Client {
|
||||||
reqwest::ClientBuilder::default()
|
reqwest::ClientBuilder::default().build().unwrap()
|
||||||
.timeout(Duration::from_secs(60 * 5))
|
|
||||||
.connect_timeout(Duration::from_secs(60))
|
|
||||||
.build()
|
|
||||||
.unwrap()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default, Debug)]
|
#[derive(Default, Debug)]
|
||||||
|
@ -140,7 +128,7 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
||||||
counters.success += 1;
|
counters.success += 1;
|
||||||
pool.get().await.unwrap().interact(move |conn| conn.execute(
|
pool.get().await.unwrap().interact(move |conn| conn.execute(
|
||||||
"INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
|
"INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
|
||||||
rusqlite::params![
|
rusqlite::params![
|
||||||
res.ean,
|
res.ean,
|
||||||
res.fetched_at,
|
res.fetched_at,
|
||||||
res.precio_centavos,
|
res.precio_centavos,
|
||||||
|
@ -154,8 +142,8 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
||||||
)).await.unwrap().unwrap();
|
)).await.unwrap().unwrap();
|
||||||
}
|
}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
match err.downcast_ref::<reqwest::Error>() {
|
match err.downcast_ref::<FetchError>() {
|
||||||
Some(e) => match e.status() {
|
Some(FetchError::Http(e)) => match e.status() {
|
||||||
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
|
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
|
||||||
_ => counters.errored += 1,
|
_ => counters.errored += 1,
|
||||||
},
|
},
|
||||||
|
@ -170,6 +158,8 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
enum FetchError {
|
enum FetchError {
|
||||||
|
#[error("reqwest error")]
|
||||||
|
Http(#[from] reqwest::Error),
|
||||||
#[error("parse error")]
|
#[error("parse error")]
|
||||||
Parse(#[from] SimpleError),
|
Parse(#[from] SimpleError),
|
||||||
#[error("tl error")]
|
#[error("tl error")]
|
||||||
|
@ -201,7 +191,8 @@ async fn fetch_and_parse(
|
||||||
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
|
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
|
||||||
.await?
|
.await?
|
||||||
.text()
|
.text()
|
||||||
.await?;
|
.await
|
||||||
|
.map_err(FetchError::Http)?;
|
||||||
|
|
||||||
let maybe_point = { scrap_url(client, url, &body).await };
|
let maybe_point = { scrap_url(client, url, &body).await };
|
||||||
|
|
||||||
|
@ -300,32 +291,19 @@ impl Auto {
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
let links: Vec<String> = {
|
let links: Vec<String> = {
|
||||||
let search = format!("%{}%", supermercado.host());
|
|
||||||
self.pool
|
self.pool
|
||||||
.get()
|
.get()
|
||||||
.await?
|
.await?
|
||||||
.interact(move |conn| -> anyhow::Result<Vec<String>> {
|
.interact(|conn| -> anyhow::Result<Vec<String>> {
|
||||||
Ok(conn
|
Ok(conn
|
||||||
.prepare(
|
.prepare(r#"SELECT url FROM producto_urls;"#)?
|
||||||
r#"SELECT url FROM producto_urls
|
.query_map([], |r| r.get::<_, String>(0))?
|
||||||
WHERE url LIKE ?1;"#,
|
|
||||||
)?
|
|
||||||
.query_map(rusqlite::params![search], |r| r.get::<_, String>(0))?
|
|
||||||
.map(|r| r.unwrap())
|
.map(|r| r.unwrap())
|
||||||
.collect())
|
.collect())
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
.unwrap()?
|
.unwrap()?
|
||||||
};
|
};
|
||||||
// {
|
|
||||||
// let debug_path = PathBuf::from("debug/");
|
|
||||||
// tokio::fs::create_dir_all(&debug_path).await.unwrap();
|
|
||||||
// let file_path = debug_path.join(format!("{}.txt", nanoid!()));
|
|
||||||
// tokio::fs::write(&file_path, &links.join("\n"))
|
|
||||||
// .await
|
|
||||||
// .unwrap();
|
|
||||||
// tracing::info!("Lista de {:?}: {:?}", &supermercado, file_path.display());
|
|
||||||
// }
|
|
||||||
{
|
{
|
||||||
let t0 = now_sec();
|
let t0 = now_sec();
|
||||||
let counters = fetch_list(&self.pool, links).await;
|
let counters = fetch_list(&self.pool, links).await;
|
||||||
|
|
Loading…
Reference in a new issue