Compare commits

..

No commits in common. "5e738985f693ec3d8b11743ce45bcaf42c717486" and "26b9f4b17f1494f3a390e3f0e406b0571fd4310f" have entirely different histories.

3 changed files with 15 additions and 49 deletions

View file

@ -54,6 +54,7 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
build-and-push-scraper:
needs: check
runs-on: ubuntu-latest
permissions:
contents: read
@ -74,16 +75,6 @@ jobs:
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
- name: Cache usr/src/app/target
uses: actions/cache@v3
with:
path: usr/src/app/target
key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
- name: inject usr/src/app/target into docker
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
with:
cache-source: usr/src/app/target
cache-target: /usr/src/app/target
- name: Build and push Docker image
uses: docker/build-push-action@v5
with:

View file

@ -6,10 +6,7 @@ FROM base as rs-build
RUN apk add --no-cache rust build-base sqlite-dev
COPY scraper-rs/ .
RUN --mount=type=cache,sharing=locked,target=/root/.cargo/git \
--mount=type=cache,sharing=locked,target=/root/.cargo/registry \
--mount=type=cache,sharing=locked,target=/usr/src/app/target \
cargo install --locked --path .
RUN cargo install --locked --path .
FROM base
RUN apk add --no-cache sqlite sqlite-libs

View file

@ -21,16 +21,6 @@ enum Supermercado {
Carrefour,
Coto,
}
impl Supermercado {
    /// Returns the host name associated with this supermarket.
    ///
    /// The value is a bare host string (no scheme, no path) with `'static`
    /// lifetime, so callers can embed it in patterns or URLs without allocating.
    fn host(&self) -> &'static str {
        match self {
            Self::Dia => "diaonline.supermercadosdia.com.ar",
            Self::Carrefour => "www.carrefour.com.ar",
            Self::Coto => "www.cotodigital3.com.ar",
            Self::Jumbo => "www.jumbo.com.ar",
        }
    }
}
#[derive(Parser)] // requires `derive` feature
enum Args {
@ -98,6 +88,8 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
let client = client.clone();
tokio::spawn(fetch_and_save(client, url, pool))
})
// https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
.boxed()
.buffer_unordered(n_coroutines)
.fold(Counters::default(), move |x, y| {
let ret = y.unwrap();
@ -118,11 +110,7 @@ fn connect_db() -> Pool {
}
/// Builds a `reqwest::Client` from a default `ClientBuilder`.
///
/// Panics if `build()` returns an error, because the result is unwrapped.
// NOTE(review): two builder expressions appear back to back in this body; they look like
// the before/after sides of a diff hunk rather than a single function body — confirm
// which one is meant to apply.
fn build_client() -> reqwest::Client {
// Variant with explicit limits: `timeout` of 5 minutes and `connect_timeout` of 60 seconds.
reqwest::ClientBuilder::default()
.timeout(Duration::from_secs(60 * 5))
.connect_timeout(Duration::from_secs(60))
.build()
.unwrap()
// Variant with builder defaults only: no timeouts are configured here.
reqwest::ClientBuilder::default().build().unwrap()
}
#[derive(Default, Debug)]
@ -140,7 +128,7 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
counters.success += 1;
pool.get().await.unwrap().interact(move |conn| conn.execute(
"INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
rusqlite::params![
rusqlite::params![
res.ean,
res.fetched_at,
res.precio_centavos,
@ -154,8 +142,8 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
)).await.unwrap().unwrap();
}
Err(err) => {
match err.downcast_ref::<reqwest::Error>() {
Some(e) => match e.status() {
match err.downcast_ref::<FetchError>() {
Some(FetchError::Http(e)) => match e.status() {
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
_ => counters.errored += 1,
},
@ -170,6 +158,8 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
#[derive(Debug, Error)]
enum FetchError {
#[error("reqwest error")]
Http(#[from] reqwest::Error),
#[error("parse error")]
Parse(#[from] SimpleError),
#[error("tl error")]
@ -201,7 +191,8 @@ async fn fetch_and_parse(
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
.await?
.text()
.await?;
.await
.map_err(FetchError::Http)?;
let maybe_point = { scrap_url(client, url, &body).await };
@ -300,32 +291,19 @@ impl Auto {
.await;
}
let links: Vec<String> = {
let search = format!("%{}%", supermercado.host());
self.pool
.get()
.await?
.interact(move |conn| -> anyhow::Result<Vec<String>> {
.interact(|conn| -> anyhow::Result<Vec<String>> {
Ok(conn
.prepare(
r#"SELECT url FROM producto_urls
WHERE url LIKE ?1;"#,
)?
.query_map(rusqlite::params![search], |r| r.get::<_, String>(0))?
.prepare(r#"SELECT url FROM producto_urls;"#)?
.query_map([], |r| r.get::<_, String>(0))?
.map(|r| r.unwrap())
.collect())
})
.await
.unwrap()?
};
// {
// let debug_path = PathBuf::from("debug/");
// tokio::fs::create_dir_all(&debug_path).await.unwrap();
// let file_path = debug_path.join(format!("{}.txt", nanoid!()));
// tokio::fs::write(&file_path, &links.join("\n"))
// .await
// .unwrap();
// tracing::info!("Lista de {:?}: {:?}", &supermercado, file_path.display());
// }
{
let t0 = now_sec();
let counters = fetch_list(&self.pool, links).await;