fuzzysearch/src/bin/load_hashes.rs

131 lines
3.9 KiB
Rust
Raw Normal View History

2020-01-12 05:07:01 +00:00
use bb8::Pool;
use bb8_postgres::PostgresConnectionManager;
use futures::future::FutureExt;
/// A post row that still needs an image hash computed.
struct NeededPost {
    /// Value of the post's `id` column.
    id: i32,
    /// Location of the post's file; this is the URL that gets downloaded and hashed.
    full_url: String,
}
async fn hash_url(
client: std::sync::Arc<reqwest::Client>,
url: String,
2020-01-12 05:56:37 +00:00
) -> Result<(img_hash::ImageHash, i64), image::ImageError> {
2020-01-12 05:07:01 +00:00
println!("loading {}", url);
let data = client
.get(&url)
.send()
.await
.expect("unable to get url")
.bytes()
.await
.expect("unable to get bytes");
let hasher = furaffinity_rs::get_hasher();
2020-01-12 05:56:37 +00:00
let image = image::load_from_memory(&data)?;
2020-01-12 05:07:01 +00:00
let hash = hasher.hash_image(&image);
let mut bytes: [u8; 8] = [0; 8];
bytes.copy_from_slice(hash.as_bytes());
let num = i64::from_be_bytes(bytes);
println!("{} - {}", url, num);
2020-01-12 05:56:37 +00:00
Ok((hash, num))
2020-01-12 05:07:01 +00:00
}
#[tokio::main]
async fn main() {
let dsn = std::env::var("POSTGRES_DSN").expect("missing postgres dsn");
use std::str::FromStr;
let manager = PostgresConnectionManager::new(
tokio_postgres::Config::from_str(&dsn).expect("unable to parse postgres dsn"),
tokio_postgres::NoTls,
);
let pool = match Pool::builder().build(manager).await {
Ok(pool) => pool,
Err(e) => panic!("unable to build pool: {}", e),
};
let client = reqwest::Client::builder()
.user_agent("Syfaro test client syfaro@huefox.com")
.build()
.expect("Unable to build http client");
let client = std::sync::Arc::new(client);
loop {
println!("getting next 100 posts");
let db = pool.clone();
let needed_posts: Vec<_> = db
.get()
.await
.unwrap()
.query(
"SELECT
id,
data->>'file_url' file_url
FROM
post
WHERE
hash IS NULL AND
hash_error IS NULL AND
2020-01-12 05:07:01 +00:00
data->>'file_ext' IN ('jpg', 'png') AND
data->>'file_url' <> '/images/deleted-preview.png'
ORDER BY id DESC
2020-01-12 05:07:01 +00:00
LIMIT 100",
&[],
)
.await
.expect("Unable to get posts")
.into_iter()
.map(|row| NeededPost {
id: row.get("id"),
full_url: row.get("file_url"),
})
.collect();
for chunk in needed_posts.chunks(8) {
let futs = chunk.iter().map(|post| {
let db = db.clone();
let client = client.clone();
let id = post.id;
2020-01-12 05:56:37 +00:00
hash_url(client, post.full_url.clone()).then(move |res| async move {
match res {
Ok((_hash, num)) => {
db.get()
.await
.unwrap()
.execute("UPDATE post SET hash = $2 WHERE id = $1", &[&id, &num])
.await
.expect("Unable to update hash in database");
}
Err(e) => {
use std::error::Error;
let desc = e.description();
println!("hashing error - {}", desc);
db.get()
.await
.unwrap()
.execute(
"UPDATE post SET hash_error = $2 WHERE id = $1",
&[&id, &desc],
)
.await
.expect("Unable to update hash in database");
}
};
2020-01-12 05:07:01 +00:00
})
});
futures::future::join_all(futs).await;
}
}
}