mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-22 22:26:19 +00:00
Compare commits
8 commits
26b9f4b17f
...
5e738985f6
Author | SHA1 | Date | |
---|---|---|---|
5e738985f6 | |||
eedeae37ea | |||
a19d1aba65 | |||
12ee9bb592 | |||
adc7cc459f | |||
0207ea2e18 | |||
1108951c79 | |||
c7a578d8cc |
3 changed files with 49 additions and 15 deletions
11
.github/workflows/container.yml
vendored
11
.github/workflows/container.yml
vendored
|
@ -54,7 +54,6 @@ jobs:
|
|||
labels: ${{ steps.meta.outputs.labels }}
|
||||
|
||||
build-and-push-scraper:
|
||||
needs: check
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
|
@ -75,6 +74,16 @@ jobs:
|
|||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/scraper
|
||||
- name: Cache usr/src/app/target
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: usr/src/app/target
|
||||
key: usr/src/app/target-${{ hashFiles('Dockerfile') }}
|
||||
- name: inject usr/src/app/target into docker
|
||||
uses: reproducible-containers/buildkit-cache-dance@v2.1.3
|
||||
with:
|
||||
cache-source: usr/src/app/target
|
||||
cache-target: /usr/src/app/target
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
|
|
|
@ -6,7 +6,10 @@ FROM base as rs-build
|
|||
RUN apk add --no-cache rust build-base sqlite-dev
|
||||
|
||||
COPY scraper-rs/ .
|
||||
RUN cargo install --locked --path .
|
||||
RUN --mount=type=cache,sharing=locked,target=/root/.cargo/git \
|
||||
--mount=type=cache,sharing=locked,target=/root/.cargo/registry \
|
||||
--mount=type=cache,sharing=locked,target=/usr/src/app/target \
|
||||
cargo install --locked --path .
|
||||
|
||||
FROM base
|
||||
RUN apk add --no-cache sqlite sqlite-libs
|
||||
|
|
|
@ -21,6 +21,16 @@ enum Supermercado {
|
|||
Carrefour,
|
||||
Coto,
|
||||
}
|
||||
impl Supermercado {
|
||||
fn host(self: &Self) -> &'static str {
|
||||
match self {
|
||||
Self::Dia => "diaonline.supermercadosdia.com.ar",
|
||||
Self::Carrefour => "www.carrefour.com.ar",
|
||||
Self::Coto => "www.cotodigital3.com.ar",
|
||||
Self::Jumbo => "www.jumbo.com.ar",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Parser)] // requires `derive` feature
|
||||
enum Args {
|
||||
|
@ -88,8 +98,6 @@ async fn fetch_list(pool: &Pool, links: Vec<String>) -> Counters {
|
|||
let client = client.clone();
|
||||
tokio::spawn(fetch_and_save(client, url, pool))
|
||||
})
|
||||
// https://github.com/rust-lang/rust/issues/89976#issuecomment-1073115246
|
||||
.boxed()
|
||||
.buffer_unordered(n_coroutines)
|
||||
.fold(Counters::default(), move |x, y| {
|
||||
let ret = y.unwrap();
|
||||
|
@ -110,7 +118,11 @@ fn connect_db() -> Pool {
|
|||
}
|
||||
|
||||
fn build_client() -> reqwest::Client {
|
||||
reqwest::ClientBuilder::default().build().unwrap()
|
||||
reqwest::ClientBuilder::default()
|
||||
.timeout(Duration::from_secs(60 * 5))
|
||||
.connect_timeout(Duration::from_secs(60))
|
||||
.build()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
|
@ -128,7 +140,7 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
|||
counters.success += 1;
|
||||
pool.get().await.unwrap().interact(move |conn| conn.execute(
|
||||
"INSERT INTO precios(ean, fetched_at, precio_centavos, in_stock, url, warc_record_id, parser_version, name, image_url) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9);",
|
||||
rusqlite::params![
|
||||
rusqlite::params![
|
||||
res.ean,
|
||||
res.fetched_at,
|
||||
res.precio_centavos,
|
||||
|
@ -142,8 +154,8 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
|||
)).await.unwrap().unwrap();
|
||||
}
|
||||
Err(err) => {
|
||||
match err.downcast_ref::<FetchError>() {
|
||||
Some(FetchError::Http(e)) => match e.status() {
|
||||
match err.downcast_ref::<reqwest::Error>() {
|
||||
Some(e) => match e.status() {
|
||||
Some(StatusCode::NOT_FOUND) => counters.skipped += 1,
|
||||
_ => counters.errored += 1,
|
||||
},
|
||||
|
@ -158,8 +170,6 @@ async fn fetch_and_save(client: reqwest::Client, url: String, pool: Pool) -> Cou
|
|||
|
||||
#[derive(Debug, Error)]
|
||||
enum FetchError {
|
||||
#[error("reqwest error")]
|
||||
Http(#[from] reqwest::Error),
|
||||
#[error("parse error")]
|
||||
Parse(#[from] SimpleError),
|
||||
#[error("tl error")]
|
||||
|
@ -191,8 +201,7 @@ async fn fetch_and_parse(
|
|||
.retry_if(|| do_request(client, &url), retry_if_wasnt_not_found)
|
||||
.await?
|
||||
.text()
|
||||
.await
|
||||
.map_err(FetchError::Http)?;
|
||||
.await?;
|
||||
|
||||
let maybe_point = { scrap_url(client, url, &body).await };
|
||||
|
||||
|
@ -291,19 +300,32 @@ impl Auto {
|
|||
.await;
|
||||
}
|
||||
let links: Vec<String> = {
|
||||
let search = format!("%{}%", supermercado.host());
|
||||
self.pool
|
||||
.get()
|
||||
.await?
|
||||
.interact(|conn| -> anyhow::Result<Vec<String>> {
|
||||
.interact(move |conn| -> anyhow::Result<Vec<String>> {
|
||||
Ok(conn
|
||||
.prepare(r#"SELECT url FROM producto_urls;"#)?
|
||||
.query_map([], |r| r.get::<_, String>(0))?
|
||||
.prepare(
|
||||
r#"SELECT url FROM producto_urls
|
||||
WHERE url LIKE ?1;"#,
|
||||
)?
|
||||
.query_map(rusqlite::params![search], |r| r.get::<_, String>(0))?
|
||||
.map(|r| r.unwrap())
|
||||
.collect())
|
||||
})
|
||||
.await
|
||||
.unwrap()?
|
||||
};
|
||||
// {
|
||||
// let debug_path = PathBuf::from("debug/");
|
||||
// tokio::fs::create_dir_all(&debug_path).await.unwrap();
|
||||
// let file_path = debug_path.join(format!("{}.txt", nanoid!()));
|
||||
// tokio::fs::write(&file_path, &links.join("\n"))
|
||||
// .await
|
||||
// .unwrap();
|
||||
// tracing::info!("Lista de {:?}: {:?}", &supermercado, file_path.display());
|
||||
// }
|
||||
{
|
||||
let t0 = now_sec();
|
||||
let counters = fetch_list(&self.pool, links).await;
|
||||
|
|
Loading…
Reference in a new issue