This commit is contained in:
Cat /dev/Nulo 2024-01-01 22:31:57 -03:00
parent 405502877c
commit 3cf723cc3d
3 changed files with 617 additions and 341 deletions

684
warcificator/Cargo.lock generated
View file

@ -18,10 +18,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "adler32"
version = "1.2.0"
name = "ahash"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234"
checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01"
dependencies = [
"cfg-if",
"getrandom",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "alloc-no-stdlib"
@ -39,25 +46,10 @@ dependencies = [
]
[[package]]
name = "android-tzdata"
version = "0.1.1"
name = "allocator-api2"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]]
name = "arrayvec"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "async-channel"
@ -119,6 +111,12 @@ version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "brotli"
version = "3.4.0"
@ -146,6 +144,12 @@ version = "3.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "bytes"
version = "1.5.0"
@ -167,20 +171,6 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
dependencies = [
"android-tzdata",
"iana-time-zone",
"js-sys",
"num-traits",
"wasm-bindgen",
"windows-targets 0.48.5",
]
[[package]]
name = "concurrent-queue"
version = "2.4.0"
@ -224,6 +214,61 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "cssparser"
version = "0.31.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf 0.11.2",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.43",
]
[[package]]
name = "derive_more"
version = "0.99.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321"
dependencies = [
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbaceec3c6e4211c79e7b1800fb9680527106beb2f9c51904a3210c03a448c74"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
[[package]]
name = "encoding_rs"
version = "0.8.33"
@ -260,6 +305,18 @@ dependencies = [
"pin-project-lite",
]
[[package]]
name = "fallible-iterator"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
[[package]]
name = "fallible-streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "flate2"
version = "1.0.28"
@ -285,6 +342,16 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "futures-channel"
version = "0.3.30"
@ -324,6 +391,24 @@ dependencies = [
"pin-utils",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "getopts"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5"
dependencies = [
"unicode-width",
]
[[package]]
name = "getrandom"
version = "0.2.11"
@ -365,6 +450,19 @@ name = "hashbrown"
version = "0.14.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
dependencies = [
"ahash",
"allocator-api2",
]
[[package]]
name = "hashlink"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
dependencies = [
"hashbrown",
]
[[package]]
name = "hermit-abi"
@ -372,6 +470,20 @@ version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "http"
version = "0.2.11"
@ -444,29 +556,6 @@ dependencies = [
"tokio-rustls",
]
[[package]]
name = "iana-time-zone"
version = "0.1.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]]
name = "idna"
version = "0.5.0"
@ -508,19 +597,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lexical-core"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe"
dependencies = [
"arrayvec",
"bitflags",
"cfg-if",
"ryu",
"static_assertions",
]
[[package]]
name = "libc"
version = "0.2.151"
@ -528,23 +604,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4"
[[package]]
name = "libflate"
version = "1.4.0"
name = "libsqlite3-sys"
version = "0.27.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18"
checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716"
dependencies = [
"adler32",
"crc32fast",
"libflate_lz77",
]
[[package]]
name = "libflate_lz77"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a52d3a8bfc85f250440e4424db7d857e241a3aebbbe301f3eb606ab15c39acbf"
dependencies = [
"rle-decode-fast",
"pkg-config",
"vcpkg",
]
[[package]]
@ -563,6 +629,26 @@ version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "memchr"
version = "2.7.1"
@ -596,24 +682,10 @@ dependencies = [
]
[[package]]
name = "nom"
version = "5.1.3"
name = "new_debug_unreachable"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08959a387a676302eebf4ddbcbc611da04285579f76f88ee0506c63b1a61dd4b"
dependencies = [
"lexical-core",
"memchr",
"version_check",
]
[[package]]
name = "num-traits"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
]
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "num_cpus"
@ -666,7 +738,7 @@ dependencies = [
"libc",
"redox_syscall",
"smallvec",
"windows-targets 0.48.5",
"windows-targets",
]
[[package]]
@ -675,6 +747,86 @@ version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_shared 0.10.0",
]
[[package]]
name = "phf"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_macros",
"phf_shared 0.11.2",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand",
]
[[package]]
name = "phf_generator"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0"
dependencies = [
"phf_shared 0.11.2",
"rand",
]
[[package]]
name = "phf_macros"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
dependencies = [
"phf_generator 0.11.2",
"phf_shared 0.11.2",
"proc-macro2",
"quote",
"syn 2.0.43",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
dependencies = [
"siphasher",
]
[[package]]
name = "pin-project-lite"
version = "0.2.13"
@ -687,6 +839,24 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkg-config"
version = "0.3.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a"
[[package]]
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro2"
version = "1.0.71"
@ -705,13 +875,43 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "redox_syscall"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags",
"bitflags 1.3.2",
]
[[package]]
@ -771,10 +971,18 @@ dependencies = [
]
[[package]]
name = "rle-decode-fast"
version = "1.0.3"
name = "rusqlite"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
checksum = "a78046161564f5e7cd9008aff3b2990b3850dc8e0349119b98e8f251e099f24d"
dependencies = [
"bitflags 2.4.1",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
"libsqlite3-sys",
"smallvec",
]
[[package]]
name = "rustc-demangle"
@ -825,6 +1033,22 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "585480e3719b311b78a573db1c9d9c4c1f8010c2dee4cc59c2efe58ea4dbc3e1"
dependencies = [
"ahash",
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"once_cell",
"selectors",
"tendril",
]
[[package]]
name = "sct"
version = "0.7.1"
@ -835,6 +1059,25 @@ dependencies = [
"untrusted",
]
[[package]]
name = "selectors"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
dependencies = [
"bitflags 2.4.1",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"phf 0.10.1",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "serde"
version = "1.0.193"
@ -852,14 +1095,14 @@ checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.43",
]
[[package]]
name = "serde_json"
version = "1.0.108"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b"
checksum = "cb0652c533506ad7a2e353cce269330d6afd8bdfb6d75e0ace5b35aacbd7b9e9"
dependencies = [
"itoa",
"ryu",
@ -878,6 +1121,15 @@ dependencies = [
"serde",
]
[[package]]
name = "servo_arc"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.1"
@ -887,6 +1139,12 @@ dependencies = [
"libc",
]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "slab"
version = "0.4.9"
@ -919,10 +1177,47 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "static_assertions"
version = "1.1.0"
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "string_cache"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro2",
"quote",
]
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
@ -941,7 +1236,7 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7"
dependencies = [
"bitflags",
"bitflags 1.3.2",
"core-foundation",
"system-configuration-sys",
]
@ -956,6 +1251,17 @@ dependencies = [
"libc",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "tinyvec"
version = "1.6.0"
@ -998,7 +1304,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.43",
]
[[package]]
@ -1077,6 +1383,12 @@ dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-width"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85"
[[package]]
name = "untrusted"
version = "0.9.0"
@ -1095,13 +1407,16 @@ dependencies = [
]
[[package]]
name = "uuid"
version = "0.8.2"
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
dependencies = [
"getrandom",
]
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
@ -1118,28 +1433,17 @@ dependencies = [
"try-lock",
]
[[package]]
name = "warc"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e44585619c04f9a4bcfdfbbbefff9e7c4e277a693424162b30c88aaf3abd785e"
dependencies = [
"chrono",
"libflate",
"nom",
"url",
"uuid",
]
[[package]]
name = "warcificator"
version = "0.1.0"
dependencies = [
"async-channel",
"http",
"reqwest",
"rusqlite",
"scraper",
"serde",
"serde_json",
"tokio",
"warc",
]
[[package]]
@ -1169,7 +1473,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn",
"syn 2.0.43",
"wasm-bindgen-shared",
]
@ -1203,7 +1507,7 @@ checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283"
dependencies = [
"proc-macro2",
"quote",
"syn",
"syn 2.0.43",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@ -1230,22 +1534,13 @@ version = "0.25.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10"
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets 0.52.0",
]
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.5",
"windows-targets",
]
[[package]]
@ -1254,28 +1549,13 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm 0.48.5",
"windows_aarch64_msvc 0.48.5",
"windows_i686_gnu 0.48.5",
"windows_i686_msvc 0.48.5",
"windows_x86_64_gnu 0.48.5",
"windows_x86_64_gnullvm 0.48.5",
"windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd"
dependencies = [
"windows_aarch64_gnullvm 0.52.0",
"windows_aarch64_msvc 0.52.0",
"windows_i686_gnu 0.52.0",
"windows_i686_msvc 0.52.0",
"windows_x86_64_gnu 0.52.0",
"windows_x86_64_gnullvm 0.52.0",
"windows_x86_64_msvc 0.52.0",
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
@ -1284,84 +1564,42 @@ version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef"
[[package]]
name = "windows_i686_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313"
[[package]]
name = "windows_i686_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04"
[[package]]
name = "winreg"
version = "0.50.0"
@ -1371,3 +1609,23 @@ dependencies = [
"cfg-if",
"windows-sys",
]
[[package]]
name = "zerocopy"
version = "0.7.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.43",
]

View file

@ -7,11 +7,13 @@ edition = "2021"
[dependencies]
async-channel = "2.1.1"
http = "0.2.11"
reqwest = { version = "0.11.23", default-features = false, features = [
"rustls-tls",
"gzip",
"brotli",
] }
rusqlite = "0.30.0"
scraper = "0.18.1"
serde = { version = "1.0.193", features = ["derive"] }
serde_json = "1.0.109"
tokio = { version = "1.35.1", features = ["full"] }
warc = "0.3.1"

View file

@ -1,24 +1,30 @@
use async_channel::{Receiver, Sender};
use rusqlite::Connection;
use scraper::{Element, Html, Selector};
use std::{
env::args,
fs,
net::SocketAddr,
process::{Command, Stdio},
time::{SystemTime, UNIX_EPOCH},
};
use tokio::io::{stderr, AsyncWriteExt};
use warc::{RecordBuilder, WarcHeader, WarcWriter};
struct FullExchange {
socket_addr: Option<SocketAddr>,
request: http::Request<&'static str>,
response: http::Response<Vec<u8>>,
#[derive(Debug)]
struct PrecioPoint {
ean: String,
// unix
fetched_at: u64,
precio_centavos: Option<u64>,
in_stock: Option<bool>,
url: String,
parser_version: u16,
name: Option<String>,
image_url: Option<String>,
}
#[tokio::main]
async fn main() {
let mut args = args().skip(1);
let links_list_path = args.next().unwrap();
let output_zstd_path = args.next().unwrap();
let links_str = fs::read_to_string(links_list_path).unwrap();
let links = links_str
.split("\n")
@ -29,7 +35,7 @@ async fn main() {
let handle = {
let (sender, receiver) = async_channel::bounded::<String>(1);
let (res_sender, res_receiver) = async_channel::unbounded::<FullExchange>();
let (res_sender, res_receiver) = async_channel::unbounded::<PrecioPoint>();
let mut handles = Vec::new();
for _ in 1..16 {
@ -38,7 +44,7 @@ async fn main() {
handles.push(tokio::spawn(worker(rx, tx)));
}
let warc_writer_handle = tokio::spawn(warc_writer(res_receiver, output_zstd_path));
let db_writer_handle = tokio::spawn(db_writer(res_receiver));
for link in links {
sender.send_blocking(link).unwrap();
@ -49,22 +55,22 @@ async fn main() {
handle.await.unwrap();
}
warc_writer_handle
db_writer_handle
};
handle.await.unwrap();
}
async fn worker(rx: Receiver<String>, tx: Sender<FullExchange>) {
async fn worker(rx: Receiver<String>, tx: Sender<PrecioPoint>) {
let client = reqwest::ClientBuilder::default().build().unwrap();
while let Ok(url) = rx.recv().await {
let res = fetch(&client, url.clone()).await;
let res = fetch_and_parse(&client, url.clone()).await;
match res {
Ok(ex) => {
tx.send(ex).await.unwrap();
}
Err(err) => {
stderr()
.write_all(format!("Failed to fetch {}: {:#?}", url.as_str(), err).as_bytes())
.write_all(format!("Failed to fetch {}: {:#?}\n", url.as_str(), err).as_bytes())
.await
.unwrap();
}
@ -72,128 +78,138 @@ async fn worker(rx: Receiver<String>, tx: Sender<FullExchange>) {
}
}
async fn fetch(client: &reqwest::Client, url: String) -> Result<FullExchange, reqwest::Error> {
let request = client.get(url).build().unwrap();
let mut http_request_builder = http::Request::builder()
.method(request.method())
.uri(request.url().as_str());
for (key, val) in request.headers() {
http_request_builder = http_request_builder.header(key, val);
}
let response = client.execute(request).await?;
#[derive(Debug)]
enum FetchError {
HttpError(reqwest::Error),
NoPriceMetaEl,
NoMetaContent,
NotANumber,
NoStockMetaEl,
NoValidStockMeta,
NoSeedState,
NoProductInSeedState,
NoProductSkuInSeedState,
}
let ip_address = response.remote_addr();
async fn fetch_and_parse(client: &reqwest::Client, url: String) -> Result<PrecioPoint, FetchError> {
let request = client.get(url.as_str()).build().unwrap();
let response = client
.execute(request)
.await
.map_err(|e| FetchError::HttpError(e))?;
let body = response
.text()
.await
.map_err(|e| FetchError::HttpError(e))?;
let http_request = {
http_request_builder
.version(response.version())
.body("")
.unwrap()
let html = Html::parse_document(&body);
let point = parse_carrefour(url, html)?;
Ok(point)
}
fn parse_carrefour(url: String, html: Html) -> Result<PrecioPoint, FetchError> {
let meta_price_sel = Selector::parse("meta[property=\"product:price:amount\"]").unwrap();
let precio_centavos = match html.select(&meta_price_sel).next() {
Some(el) => match el.attr("content") {
Some(attr) => match attr.parse::<f64>() {
Ok(f) => Ok((f * 100.0) as u64),
Err(_) => Err(FetchError::NotANumber),
},
None => Err(FetchError::NoMetaContent),
},
None => Err(FetchError::NoPriceMetaEl),
}?;
let meta_stock_el = Selector::parse("meta[property=\"product:availability\"]").unwrap();
let in_stock = match html.select(&meta_stock_el).next() {
Some(el) => match el.attr("content") {
Some(attr) => match attr {
"oos" => Ok(Some(false)),
"instock" => Ok(Some(true)),
_ => Err(FetchError::NoValidStockMeta),
},
None => Err(FetchError::NoMetaContent),
},
None => Err(FetchError::NoStockMetaEl),
}?;
let ean = {
let state = parse_script_json(&html, "__STATE__").ok_or(FetchError::NoSeedState)?;
let seed_state = &state.as_object().ok_or(FetchError::NoSeedState)?;
let (_, product_json) = seed_state
.into_iter()
.find(|(key, val)| {
key.starts_with("Product:")
&& val.as_object().map_or(false, |val| {
val.get("__typename")
.map_or(false, |typename| typename == "Product")
})
})
.ok_or(FetchError::NoProductInSeedState)?;
let cache_id = product_json
.get("cacheId")
.ok_or(FetchError::NoProductInSeedState)?;
let (_, product_sku_json) = seed_state
.into_iter()
.filter_map(|(key, val)| val.as_object().map_or(None, |o| Some((key, o))))
.find(|(key, val)| {
key.starts_with(&format!("Product:{}", cache_id))
&& val
.get("__typename")
.map_or(false, |typename| typename == "SKU")
})
.ok_or(FetchError::NoProductSkuInSeedState)?;
product_sku_json
.get("ean")
.ok_or(FetchError::NoProductSkuInSeedState)?
.as_str()
.ok_or(FetchError::NoProductSkuInSeedState)?
.to_string()
};
let http_response = {
let mut http_response_builder = http::Response::<()>::builder()
.status(response.status())
.version(response.version());
for (key, val) in response.headers() {
http_response_builder = http_response_builder.header(key, val);
}
let body = response.bytes().await?;
http_response_builder.body(body.to_vec()).unwrap()
};
Ok(FullExchange {
socket_addr: ip_address,
request: http_request,
response: http_response,
Ok(PrecioPoint {
ean: ean,
fetched_at: now_sec(),
in_stock: in_stock,
name: None,
image_url: None,
parser_version: 5,
precio_centavos: Some(precio_centavos),
url: url,
})
}
async fn warc_writer(rx: Receiver<FullExchange>, output_zstd_path: String) {
let zstd_proc = Command::new("zstd")
.args(&["-T0", "-15", "--long", "-o", &output_zstd_path])
.stdin(Stdio::piped())
.stderr(Stdio::null())
.stdout(Stdio::null())
.spawn()
.unwrap();
let mut writer = WarcWriter::new(zstd_proc.stdin.unwrap());
writer
.write(
&RecordBuilder::default()
.version("1.0".to_owned())
.warc_type(warc::RecordType::WarcInfo)
.header(WarcHeader::ContentType, "application/warc-fields")
.body(format!("software: preciazo-warcificator/0.0.0\nformat: WARC file version 1.0\nconformsTo: http://www.archive.org/documents/WarcFileFormat-1.0.html").into())
.build()
.unwrap(),
)
.unwrap();
while let Ok(res) = rx.recv().await {
let uri = res.request.uri().to_string();
let req_record = {
let mut builder = RecordBuilder::default()
.version("1.0".to_owned())
.warc_type(warc::RecordType::Request)
.header(WarcHeader::TargetURI, uri.clone())
.header(WarcHeader::ContentType, "application/http;msgtype=request")
.header(
WarcHeader::Unknown("X-Warcificator-Lying".to_string()),
"the request contains other headers not included here",
);
if let Some(addr) = res.socket_addr {
builder = builder.header(WarcHeader::IPAddress, addr.ip().to_string());
}
builder
.body(format_http11_request(res.request).into_bytes())
.build()
.unwrap()
};
writer.write(&req_record).unwrap();
writer
.write(&{
let mut builder = RecordBuilder::default()
.version("1.0".to_owned())
.warc_type(warc::RecordType::Response)
.header(WarcHeader::TargetURI, uri)
.header(WarcHeader::ConcurrentTo, req_record.warc_id())
.header(WarcHeader::ContentType, "application/http;msgtype=response");
if let Some(addr) = res.socket_addr {
builder = builder.header(WarcHeader::IPAddress, addr.ip().to_string());
}
builder
.body(format_http11_response(res.response))
.build()
.unwrap()
})
.unwrap();
fn parse_script_json(html: &Html, varname: &str) -> Option<serde_json::Value> {
let template_sel = Selector::parse(&format!(
"template[data-type=\"json\"][data-varname=\"{}\"]",
varname
))
.unwrap();
match html.select(&template_sel).next() {
Some(value) => match value.first_element_child() {
Some(script) => match serde_json::from_str(&script.inner_html()) {
Ok(val) => val,
Err(_) => None,
},
None => None,
},
None => None,
}
}
fn format_http11_request(req: http::Request<&'static str>) -> String {
let start_line = format!("{} {} HTTP/1.1", req.method().as_str(), req.uri().path());
let headers_str = req
.headers()
.iter()
.map(|(key, val)| format!("{}: {}\r\n", key, val.to_str().unwrap()))
.collect::<String>();
[start_line.as_str(), headers_str.as_str(), req.body()].join("\r\n")
fn now_sec() -> u64 {
let start = SystemTime::now();
let since_the_epoch = start
.duration_since(UNIX_EPOCH)
.expect("Time went backwards");
since_the_epoch.as_secs()
}
fn format_http11_response(res: http::Response<Vec<u8>>) -> Vec<u8> {
let start_line = format!(
"HTTP/1.1 {} {}",
res.status().as_str(),
res.status().canonical_reason().unwrap_or("")
);
let headers_str = res
.headers()
.iter()
.map(|(key, val)| format!("{}: {}\r\n", key, val.to_str().unwrap()))
.collect::<String>();
let crlf: &[u8] = &[13, 10];
[start_line.as_bytes(), headers_str.as_bytes(), res.body()].join(crlf)
async fn db_writer(rx: Receiver<PrecioPoint>) {
let conn = Connection::open("../scraper/sqlite.db").unwrap();
// let mut stmt = conn.prepare("SELECT id, name, data FROM person")?;
while let Ok(res) = rx.recv().await {
println!("{:#?}", res)
}
}