mirror of
https://github.com/catdevnull/preciazo.git
synced 2024-11-29 13:06:19 +00:00
WIP: fast downloader
This commit is contained in:
parent
cd6bbbdbe8
commit
d820bcc457
4 changed files with 1517 additions and 1 deletions
|
@ -8,7 +8,8 @@
|
|||
"ghcr.io/shyim/devcontainers-features/bun:0": {},
|
||||
"ghcr.io/devcontainers/features/git-lfs:1": {},
|
||||
"ghcr.io/devcontainers/features/node:1": {},
|
||||
"ghcr.io/swift-server-community/swift-devcontainer-features/sqlite:1": {}
|
||||
"ghcr.io/swift-server-community/swift-devcontainer-features/sqlite:1": {},
|
||||
"ghcr.io/devcontainers/features/rust:1": {}
|
||||
},
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
|
|
1373
warcificator/Cargo.lock
generated
Normal file
1373
warcificator/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
17
warcificator/Cargo.toml
Normal file
17
warcificator/Cargo.toml
Normal file
|
@ -0,0 +1,17 @@
|
|||
[package]
|
||||
name = "warcificator"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
async-channel = "2.1.1"
|
||||
http = "0.2.11"
|
||||
reqwest = { version = "0.11.23", default-features = false, features = [
|
||||
"rustls-tls",
|
||||
"gzip",
|
||||
"brotli",
|
||||
] }
|
||||
tokio = { version = "1.35.1", features = ["full"] }
|
||||
warc = "0.3.1"
|
125
warcificator/src/main.rs
Normal file
125
warcificator/src/main.rs
Normal file
|
@ -0,0 +1,125 @@
|
|||
use async_channel::{Receiver, Sender};
|
||||
use std::{env::args, fs, io::stdout, net::SocketAddr};
|
||||
use warc::{RecordBuilder, WarcHeader, WarcWriter};
|
||||
|
||||
struct FullExchange {
|
||||
ip_address: SocketAddr,
|
||||
request: http::Request<&'static str>,
|
||||
response: http::Response<String>,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let links_list_path = args().skip(1).next().unwrap();
|
||||
let links_str = fs::read_to_string(links_list_path).unwrap();
|
||||
let links = links_str
|
||||
.split("\n")
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| s.len() > 0)
|
||||
.map(|s| s.to_owned())
|
||||
.collect::<Vec<_>>();
|
||||
let handle = {
|
||||
let (sender, receiver) = async_channel::bounded::<String>(1);
|
||||
let (res_sender, res_receiver) = async_channel::unbounded::<FullExchange>();
|
||||
|
||||
let mut handles = Vec::new();
|
||||
for _ in 1..16 {
|
||||
let rx = receiver.clone();
|
||||
let tx = res_sender.clone();
|
||||
handles.push(tokio::spawn(worker(rx, tx)));
|
||||
}
|
||||
|
||||
let warc_writer_handle = tokio::spawn(warc_writer(res_receiver));
|
||||
|
||||
for link in links {
|
||||
sender.send_blocking(link).unwrap();
|
||||
}
|
||||
sender.close();
|
||||
|
||||
for handle in handles {
|
||||
handle.await.unwrap();
|
||||
}
|
||||
|
||||
warc_writer_handle
|
||||
};
|
||||
handle.await.unwrap();
|
||||
}
|
||||
|
||||
async fn worker(rx: Receiver<String>, tx: Sender<FullExchange>) {
|
||||
let client = reqwest::ClientBuilder::default().build().unwrap();
|
||||
while let Ok(url) = rx.recv().await {
|
||||
let request = client.get(url).build().unwrap();
|
||||
let mut http_request_builder = http::Request::builder()
|
||||
.method(request.method())
|
||||
.uri(request.url().as_str());
|
||||
for (key, val) in request.headers() {
|
||||
http_request_builder = http_request_builder.header(key, val);
|
||||
}
|
||||
let response = client.execute(request).await.unwrap();
|
||||
|
||||
let ip_address = response.remote_addr().unwrap();
|
||||
|
||||
let http_request = {
|
||||
http_request_builder
|
||||
.version(response.version())
|
||||
.body("")
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let http_response = {
|
||||
let mut http_response_builder = http::Response::<()>::builder()
|
||||
.status(response.status())
|
||||
.version(response.version());
|
||||
for (key, val) in response.headers() {
|
||||
http_response_builder = http_response_builder.header(key, val);
|
||||
}
|
||||
let body = response.text().await.unwrap();
|
||||
http_response_builder.body(body).unwrap()
|
||||
};
|
||||
|
||||
tx.send(FullExchange {
|
||||
ip_address: ip_address,
|
||||
request: http_request,
|
||||
response: http_response,
|
||||
})
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn warc_writer(rx: Receiver<FullExchange>) {
|
||||
let mut writer = WarcWriter::new(stdout());
|
||||
let warc_fields = format!("software: preciazo-warcificator/0.0.0\nformat: WARC file version 1.0\nconformsTo: http://www.archive.org/documents/WarcFileFormat-1.0.html");
|
||||
writer
|
||||
.write(
|
||||
&RecordBuilder::default()
|
||||
.version("1.0".to_owned())
|
||||
.warc_type(warc::RecordType::WarcInfo)
|
||||
.header(WarcHeader::ContentType, "application/warc-fields")
|
||||
.body(warc_fields.into())
|
||||
.build()
|
||||
.unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
while let Ok(res) = rx.recv().await {
|
||||
writer
|
||||
.write(
|
||||
&RecordBuilder::default()
|
||||
.version("1.0".to_owned())
|
||||
.warc_type(warc::RecordType::Request)
|
||||
.header(WarcHeader::TargetURI, res.request.uri().to_string())
|
||||
.header(WarcHeader::IPAddress, res.ip_address.to_string())
|
||||
.header(WarcHeader::ContentType, "application/http;msgtype=request")
|
||||
.body(warc_fields.into())
|
||||
.build()
|
||||
.unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
fn format_http11_request(req: http::Request<&'static str>) -> String {
|
||||
let headers_str=req.headers().iter().map(|(key,val)| format!("{}: {}\n",key,val.to_str().unwrap())).collect::<String>();
|
||||
|
||||
format!(r#"{} {} HTTP/1.1
|
||||
{}"#, req.method().as_str(), req.uri().path(), headers_str)
|
||||
}
|
Loading…
Reference in a new issue