Compare commits

..

8 commits

Author SHA1 Message Date
5e738985f6 ci: scraper ya no depende del ts 2024-01-12 19:11:06 -03:00
eedeae37ea solo cachear target 2024-01-12 19:10:29 -03:00
a19d1aba65 arreglar contador skipped 2024-01-12 19:06:35 -03:00
12ee9bb592 ci: cache 2024-01-12 18:52:59 -03:00
adc7cc459f timeout 2024-01-12 18:45:00 -03:00
0207ea2e18 docker: cachear rust 2024-01-12 18:11:33 -03:00
1108951c79 solo pedir urls del supermercado siendo fetcheado 2024-01-12 18:09:30 -03:00
c7a578d8cc no boxear 2024-01-12 18:00:21 -03:00
3 changed files with 49 additions and 15 deletions

View file

@ -54,7 +54,6 @@ jobs:
labels: ${{ steps.meta.outputs.labels }} labels: ${{ steps.meta.outputs.labels }}
build-and-push-scraper: build-and-push-scraper:
needs: check
runs-on: ubuntu-latest runs-on: ubuntu-latest
permissions: permissions:
contents: read contents: read
@ -75,6 +74,16 @@ jobs:
uses: docker/metadata-action@v5 uses: docker/metadata-action@v5
with: with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
- name: Cache usr/src/app/target
uses: actions/cache@v3
with:
path: usr/src/app/target
key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
- name: inject usr/src/app/target into docker
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
with:
cache-source: usr/src/app/target
cache-target: /usr/src/app/target
- name: Build and push Docker image - name: Build and push Docker image
uses: docker/build-push-action@v5 uses: docker/build-push-action@v5
with: with:

View file

@ -6,7 +6,10 @@ FROM base as rs-build
RUN apk add --no-cache rust build-base sqlite-dev RUN apk add --no-cache rust build-base sqlite-dev
COPY scraper-rs/ . COPY scraper-rs/ .
RUN cargo install --locked --path . RUN --mount=type=cache,sharing=locked,target=/root/.cargo/git \
--mount=type=cache,sharing=locked,target=/root/.cargo/registry \
--mount=type=cache,sharing=locked,target=/usr/src/app/target \
cargo install --locked --path .
FROM base FROM base
RUN apk add --no-cache sqlite sqlite-libs RUN apk add --no-cache sqlite sqlite-libs

View file

@ -21,6 +21,16 @@ enum Supermercado {
Carrefour, Carrefour,
Coto, Coto,
} }
impl Supermercado {
    /// Returns the web host name associated with this supermarket.
    ///
    /// The value is a bare host (no scheme, no path) with `'static`
    /// lifetime, so it can be embedded in other strings without tying
    /// the result to the lifetime of `self`.
    fn host(&self) -> &'static str {
        // Exhaustive match on purpose: adding a new variant must fail to
        // compile here until its host is filled in.
        match self {
            Self::Dia => "diaonline.supermercadosdia.com.ar",
            Self::Carrefour => "www.carrefour.com.ar",
            Self::Coto => "www.cotodigital3.com.ar",
            Self::Jumbo => "www.jumbo.com.ar",
        }
    }
}
#[derive(Parser)] // requires `derive` feature #[derive(Parser)] // requires `derive` feature
enum Args { enum Args {
@ -88,8 +98,6 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
let client = client.clone(); let client = client.clone();
tokio::spawn(fetch_and_save(client, url, pool)) tokio::spawn(fetch_and_save(client, url, pool))
}) })
// https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
.boxed()
.buffer_unordered(n_coroutines) .buffer_unordered(n_coroutines)
.fold(Counters::default(), move |x, y| { .fold(Counters::default(), move |x, y| {
let ret = y.unwrap(); let ret = y.unwrap();
@ -110,7 +118,11 @@ fn connect_db() -> Pool {
} }
fn build_client() -> reqwest::Client { fn build_client() -> reqwest::Client {
reqwest::ClientBuilder::default().build().unwrap() reqwest::ClientBuilder::default()
.timeout(Duration::from_secs(60 * 5))
.connect_timeout(Duration::from_secs(60))
.build()
.unwrap()
} }
#[derive(Default, Debug)] #[derive(Default, Debug)]
@ -128,7 +140,7 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
counters.success += 1; counters.success += 1;
pool.get().await.unwrap().interact(move |conn| conn.execute( pool.get().await.unwrap().interact(move |conn| conn.execute(
"INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);", "INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
rusqlite::params![ rusqlite::params![
res.ean, res.ean,
res.fetched_at, res.fetched_at,
res.precio_centavos, res.precio_centavos,
@ -142,8 +154,8 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
)).await.unwrap().unwrap(); )).await.unwrap().unwrap();
} }
Err(err) => { Err(err) => {
match err.downcast_ref::<FetchError>() { match err.downcast_ref::<reqwest::Error>() {
Some(FetchError::Http(e)) => match e.status() { Some(e) => match e.status() {
Some(StatusCode::NOT_FOUND) => counters.skipped += 1, Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
_ => counters.errored += 1, _ => counters.errored += 1,
}, },
@ -158,8 +170,6 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
#[derive(Debug, Error)] #[derive(Debug, Error)]
enum FetchError { enum FetchError {
#[error("reqwest error")]
Http(#[from] reqwest::Error),
#[error("parse error")] #[error("parse error")]
Parse(#[from] SimpleError), Parse(#[from] SimpleError),
#[error("tl error")] #[error("tl error")]
@ -191,8 +201,7 @@ async fn fetch_and_parse(
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found) .retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
.await? .await?
.text() .text()
.await .await?;
.map_err(FetchError::Http)?;
let maybe_point = { scrap_url(client, url, &body).await }; let maybe_point = { scrap_url(client, url, &body).await };
@ -291,19 +300,32 @@ impl Auto {
.await; .await;
} }
let links: Vec<String> = { let links: Vec<String> = {
let search = format!("%{}%", supermercado.host());
self.pool self.pool
.get() .get()
.await? .await?
.interact(|conn| -> anyhow::Result<Vec<String>> { .interact(move |conn| -> anyhow::Result<Vec<String>> {
Ok(conn Ok(conn
.prepare(r#"SELECT url FROM producto_urls;"#)? .prepare(
.query_map([], |r| r.get::<_, String>(0))? r#"SELECT url FROM producto_urls
WHERE url LIKE ?1;"#,
)?
.query_map(rusqlite::params![search], |r| r.get::<_, String>(0))?
.map(|r| r.unwrap()) .map(|r| r.unwrap())
.collect()) .collect())
}) })
.await .await
.unwrap()? .unwrap()?
}; };
// {
// let debug_path = PathBuf::from("debug/");
// tokio::fs::create_dir_all(&debug_path).await.unwrap();
// let file_path = debug_path.join(format!("{}.txt", nanoid!()));
// tokio::fs::write(&file_path, &links.join("\n"))
// .await
// .unwrap();
// tracing::info!("Lista de {:?}: {:?}", &supermercado, file_path.display());
// }
{ {
let t0 = now_sec(); let t0 = now_sec();
let counters = fetch_list(&self.pool, links).await; let counters = fetch_list(&self.pool, links).await;