mirror of
https://github.com/Syfaro/fuzzysearch.git
synced 2024-11-23 15:22:31 +00:00
Somewhat untested changes for new e621 API.
This commit is contained in:
parent
ceeef0b9e9
commit
2a2c3b88fc
2249
Cargo.lock
generated
2249
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -17,6 +17,9 @@ tokio-postgres = { version = "0.5.1", features = ["with-serde_json-1"] }
|
|||||||
bb8 = { git = "https://github.com/khuey/bb8.git" }
|
bb8 = { git = "https://github.com/khuey/bb8.git" }
|
||||||
bb8-postgres = { git = "https://github.com/khuey/bb8.git" }
|
bb8-postgres = { git = "https://github.com/khuey/bb8.git" }
|
||||||
|
|
||||||
furaffinity-rs = { git = "https://git.huefox.com/syfaro/furaffinity-rs.git" }
|
furaffinity-rs = { git = "https://github.com/Syfaro/furaffinity-rs.git" }
|
||||||
image = "0.22"
|
image = "0.23"
|
||||||
img_hash = "*"
|
img_hash = "*"
|
||||||
|
|
||||||
|
[profile.dev.package."*"]
|
||||||
|
opt-level = 2
|
||||||
|
@ -4,18 +4,17 @@ async fn load_page(
|
|||||||
) -> (Vec<i32>, serde_json::Value) {
|
) -> (Vec<i32>, serde_json::Value) {
|
||||||
println!("Loading page with before_id {:?}", before_id);
|
println!("Loading page with before_id {:?}", before_id);
|
||||||
|
|
||||||
let mut query: Vec<(&'static str, String)> =
|
let mut query: Vec<(&'static str, String)> = vec![("limit", "320".into())];
|
||||||
vec![("typed_tags", "true".into()), ("count", "320".into())];
|
|
||||||
|
|
||||||
if let Some(before_id) = before_id {
|
if let Some(before_id) = before_id {
|
||||||
query.push(("before_id", before_id.to_string()));
|
query.push(("page", format!("b{}", before_id)));
|
||||||
if before_id <= 14 {
|
if before_id <= 14 {
|
||||||
panic!("that's it.");
|
panic!("that's it.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let body = client
|
let body = client
|
||||||
.get("https://e621.net/post/index.json")
|
.get("https://e621.net/posts.json")
|
||||||
.query(&query)
|
.query(&query)
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
@ -26,11 +25,17 @@ async fn load_page(
|
|||||||
|
|
||||||
let json = serde_json::from_str(&body).expect("Unable to parse data");
|
let json = serde_json::from_str(&body).expect("Unable to parse data");
|
||||||
|
|
||||||
let posts = match json {
|
let page = match json {
|
||||||
serde_json::Value::Array(ref arr) => arr,
|
serde_json::Value::Object(ref obj) => obj,
|
||||||
_ => panic!("invalid response"),
|
_ => panic!("top level value was not object"),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let posts = page
|
||||||
|
.get("posts")
|
||||||
|
.expect("unable to get posts object")
|
||||||
|
.as_array()
|
||||||
|
.expect("posts was not array");
|
||||||
|
|
||||||
let ids = posts
|
let ids = posts
|
||||||
.iter()
|
.iter()
|
||||||
.map(|post| {
|
.map(|post| {
|
||||||
@ -48,7 +53,7 @@ async fn load_page(
|
|||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
(ids, json)
|
(ids, serde_json::Value::Array(posts.to_vec()))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
|
@ -10,7 +10,7 @@ struct NeededPost {
|
|||||||
async fn hash_url(
|
async fn hash_url(
|
||||||
client: std::sync::Arc<reqwest::Client>,
|
client: std::sync::Arc<reqwest::Client>,
|
||||||
url: String,
|
url: String,
|
||||||
) -> Result<(img_hash::ImageHash, i64), image::ImageError> {
|
) -> Result<(img_hash::ImageHash<[u8; 8]>, i64), image::ImageError> {
|
||||||
let data = client
|
let data = client
|
||||||
.get(&url)
|
.get(&url)
|
||||||
.send()
|
.send()
|
||||||
@ -21,7 +21,13 @@ async fn hash_url(
|
|||||||
.expect("unable to get bytes");
|
.expect("unable to get bytes");
|
||||||
|
|
||||||
let hasher = furaffinity_rs::get_hasher();
|
let hasher = furaffinity_rs::get_hasher();
|
||||||
let image = image::load_from_memory(&data)?;
|
let image = match image::load_from_memory(&data) {
|
||||||
|
Ok(image) => image,
|
||||||
|
Err(e) => {
|
||||||
|
println!("{:?}", &data[0..50]);
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let hash = hasher.hash_image(&image);
|
let hash = hasher.hash_image(&image);
|
||||||
let mut bytes: [u8; 8] = [0; 8];
|
let mut bytes: [u8; 8] = [0; 8];
|
||||||
@ -43,14 +49,14 @@ async fn load_next_posts(
|
|||||||
.query(
|
.query(
|
||||||
"SELECT
|
"SELECT
|
||||||
id,
|
id,
|
||||||
data->>'file_url' file_url
|
data->'file'->>'url' file_url
|
||||||
FROM
|
FROM
|
||||||
e621
|
e621
|
||||||
WHERE
|
WHERE
|
||||||
hash IS NULL AND
|
hash IS NULL AND
|
||||||
hash_error IS NULL AND
|
hash_error IS NULL AND
|
||||||
data->>'file_ext' IN ('jpg', 'png') AND
|
data->'file'->>'ext' IN ('jpg', 'png') AND
|
||||||
data->>'file_url' <> '/images/deleted-preview.png'
|
data->'file'->>'url' <> '/images/deleted-preview.png'
|
||||||
ORDER BY id DESC
|
ORDER BY id DESC
|
||||||
LIMIT 384",
|
LIMIT 384",
|
||||||
&[],
|
&[],
|
||||||
@ -106,17 +112,31 @@ async fn main() {
|
|||||||
hash_url(client, post.full_url.clone()).then(move |res| async move {
|
hash_url(client, post.full_url.clone()).then(move |res| async move {
|
||||||
match res {
|
match res {
|
||||||
Ok((_hash, num)) => {
|
Ok((_hash, num)) => {
|
||||||
db.get()
|
let mut conn = db.get().await.unwrap();
|
||||||
|
|
||||||
|
let tx = conn
|
||||||
|
.transaction()
|
||||||
.await
|
.await
|
||||||
.unwrap()
|
.expect("Unable to create transaction");
|
||||||
.execute("UPDATE e621 SET hash = $2 WHERE id = $1", &[&id, &num])
|
|
||||||
|
tx.execute("UPDATE e621 SET hash = $2 WHERE id = $1", &[&id, &num])
|
||||||
.await
|
.await
|
||||||
.expect("Unable to update hash in database");
|
.expect("Unable to update hash in database");
|
||||||
|
|
||||||
|
tx.execute(
|
||||||
|
"INSERT INTO hashes (e621_id, hash) VALUES ($1, $2)",
|
||||||
|
&[&id, &num],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect("Unable to insert hash to hashes table");
|
||||||
|
|
||||||
|
tx.commit().await.expect("Unable to commit tx");
|
||||||
|
|
||||||
|
drop(conn);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
use std::error::Error;
|
let desc = e.to_string();
|
||||||
let desc = e.description();
|
println!("[{}] hashing error - {}", id, desc);
|
||||||
println!("hashing error - {}", desc);
|
|
||||||
db.get()
|
db.get()
|
||||||
.await
|
.await
|
||||||
.unwrap()
|
.unwrap()
|
||||||
@ -131,7 +151,11 @@ async fn main() {
|
|||||||
})
|
})
|
||||||
});
|
});
|
||||||
|
|
||||||
|
println!("joining futs");
|
||||||
|
|
||||||
futures::future::join_all(futs).await;
|
futures::future::join_all(futs).await;
|
||||||
|
|
||||||
|
println!("futs completed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,124 +0,0 @@
|
|||||||
#[derive(Debug)]
|
|
||||||
struct Row {
|
|
||||||
id: i32,
|
|
||||||
artists: Option<Vec<String>>,
|
|
||||||
sources: Option<Vec<String>>,
|
|
||||||
distance: Option<u64>,
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn get_hash_distance_from_url(
|
|
||||||
client: &reqwest::Client,
|
|
||||||
url: &str,
|
|
||||||
other: &img_hash::ImageHash,
|
|
||||||
) -> Result<u32, Box<dyn std::error::Error>> {
|
|
||||||
let data = client.get(url).send().await?.bytes().await?;
|
|
||||||
|
|
||||||
let hasher = furaffinity_rs::get_hasher();
|
|
||||||
let image = image::load_from_memory(&data)?;
|
|
||||||
|
|
||||||
let hash = hasher.hash_image(&image);
|
|
||||||
Ok(hash.dist(&other))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
|
||||||
async fn main() {
|
|
||||||
let dsn = std::env::var("POSTGRES_DSN").expect("missing postgres dsn");
|
|
||||||
let file = std::env::args().nth(1).expect("missing image");
|
|
||||||
|
|
||||||
let (db, connection) = tokio_postgres::connect(&dsn, tokio_postgres::NoTls)
|
|
||||||
.await
|
|
||||||
.expect("Unable to connect");
|
|
||||||
|
|
||||||
tokio::spawn(async move {
|
|
||||||
if let Err(e) = connection.await {
|
|
||||||
eprintln!("connection error: {}", e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
let client = reqwest::Client::builder()
|
|
||||||
.user_agent("Syfaro test client syfaro@huefox.com")
|
|
||||||
.build()
|
|
||||||
.expect("Unable to build http client");
|
|
||||||
|
|
||||||
let image = image::open(&file).expect("unable to open image");
|
|
||||||
|
|
||||||
let hasher = furaffinity_rs::get_hasher();
|
|
||||||
let hash = hasher.hash_image(&image);
|
|
||||||
|
|
||||||
let mut bytes: [u8; 8] = [0; 8];
|
|
||||||
bytes.copy_from_slice(hash.as_bytes());
|
|
||||||
|
|
||||||
let num = i64::from_be_bytes(bytes);
|
|
||||||
|
|
||||||
let rows = db
|
|
||||||
.query(
|
|
||||||
"SELECT
|
|
||||||
post.id id,
|
|
||||||
post.hash hash,
|
|
||||||
artists_agg.artists artists,
|
|
||||||
sources_agg.sources sources
|
|
||||||
FROM
|
|
||||||
e621,
|
|
||||||
LATERAL (
|
|
||||||
SELECT array_agg(v) artists FROM jsonb_array_elements_text(data->'artist') v
|
|
||||||
) artists_agg,
|
|
||||||
LATERAL (
|
|
||||||
SELECT array_agg(v) sources FROM jsonb_array_elements_text(data->'sources') v
|
|
||||||
) sources_agg
|
|
||||||
WHERE hash <@ ($1, 10)",
|
|
||||||
&[&num],
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
.expect("unable to query")
|
|
||||||
.into_iter()
|
|
||||||
.map(|row| {
|
|
||||||
let distance = row
|
|
||||||
.get::<&str, Option<i64>>("hash")
|
|
||||||
.map(|hash| hamming::distance_fast(&hash.to_be_bytes(), &bytes).unwrap());
|
|
||||||
|
|
||||||
Row {
|
|
||||||
id: row.get("id"),
|
|
||||||
sources: row.get("sources"),
|
|
||||||
artists: row.get("artists"),
|
|
||||||
distance,
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
for row in rows {
|
|
||||||
let dist = row.distance.unwrap_or_else(u64::max_value);
|
|
||||||
if dist > 5 {
|
|
||||||
println!(
|
|
||||||
"Skipping https://e621.net/post/show/{}, distance too high: {}",
|
|
||||||
row.id, dist
|
|
||||||
);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
println!(
|
|
||||||
"Possible match: [distance of {}] https://e621.net/post/show/{} by {}",
|
|
||||||
dist,
|
|
||||||
row.id,
|
|
||||||
row.artists
|
|
||||||
.map(|artists| artists.join(", "))
|
|
||||||
.unwrap_or_else(|| "unknown".to_string())
|
|
||||||
);
|
|
||||||
let sources = match row.sources {
|
|
||||||
Some(source) => source,
|
|
||||||
_ => {
|
|
||||||
println!("no sources");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
for source in sources {
|
|
||||||
let distance = get_hash_distance_from_url(&client, &source, &hash).await;
|
|
||||||
println!(
|
|
||||||
"- {} (distance of {})",
|
|
||||||
source,
|
|
||||||
if let Ok(d) = distance {
|
|
||||||
d.to_string()
|
|
||||||
} else {
|
|
||||||
"unknown".to_string()
|
|
||||||
}
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,18 +1,11 @@
|
|||||||
async fn load_page(
|
async fn load_page(client: &reqwest::Client, after_id: i32) -> (Vec<i32>, Vec<serde_json::Value>) {
|
||||||
client: &reqwest::Client,
|
println!("Loading page with after_id {:?}", after_id);
|
||||||
before_id: Option<i32>,
|
|
||||||
) -> (Vec<i32>, serde_json::Value) {
|
|
||||||
println!("Loading page with before_id {:?}", before_id);
|
|
||||||
|
|
||||||
let mut query: Vec<(&'static str, String)> =
|
let mut query: Vec<(&'static str, String)> = vec![("limit", "320".into())];
|
||||||
vec![("typed_tags", "true".into()), ("count", "320".into())];
|
query.push(("page", format!("a{}", after_id)));
|
||||||
|
|
||||||
if let Some(before_id) = before_id {
|
|
||||||
query.push(("before_id", before_id.to_string()));
|
|
||||||
}
|
|
||||||
|
|
||||||
let body = client
|
let body = client
|
||||||
.get("https://e621.net/post/index.json")
|
.get("https://e621.net/posts.json")
|
||||||
.query(&query)
|
.query(&query)
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
@ -23,11 +16,17 @@ async fn load_page(
|
|||||||
|
|
||||||
let json = serde_json::from_str(&body).expect("Unable to parse data");
|
let json = serde_json::from_str(&body).expect("Unable to parse data");
|
||||||
|
|
||||||
let posts = match json {
|
let page = match json {
|
||||||
serde_json::Value::Array(ref arr) => arr,
|
serde_json::Value::Object(ref obj) => obj,
|
||||||
_ => panic!("invalid response"),
|
_ => panic!("top level value was not object"),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let posts = page
|
||||||
|
.get("posts")
|
||||||
|
.expect("unable to get posts object")
|
||||||
|
.as_array()
|
||||||
|
.expect("posts was not array");
|
||||||
|
|
||||||
let ids = posts
|
let ids = posts
|
||||||
.iter()
|
.iter()
|
||||||
.map(|post| {
|
.map(|post| {
|
||||||
@ -45,14 +44,62 @@ async fn load_page(
|
|||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
(ids, json)
|
(ids, posts.to_vec())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_latest_id(client: &reqwest::Client) -> i32 {
|
||||||
|
println!("Looking up current highest ID");
|
||||||
|
|
||||||
|
let query = vec![("limit", "1")];
|
||||||
|
|
||||||
|
let body = client
|
||||||
|
.get("https://e621.net/posts.json")
|
||||||
|
.query(&query)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.expect("unable to make request")
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.expect("unable to convert to text");
|
||||||
|
|
||||||
|
let json = serde_json::from_str(&body).expect("Unable to parse data");
|
||||||
|
|
||||||
|
let page = match json {
|
||||||
|
serde_json::Value::Object(ref obj) => obj,
|
||||||
|
_ => panic!("top level value was not object"),
|
||||||
|
};
|
||||||
|
|
||||||
|
let posts = page
|
||||||
|
.get("posts")
|
||||||
|
.expect("unable to get posts object")
|
||||||
|
.as_array()
|
||||||
|
.expect("posts was not array");
|
||||||
|
|
||||||
|
let ids: Vec<i32> = posts
|
||||||
|
.iter()
|
||||||
|
.map(|post| {
|
||||||
|
let post = match post {
|
||||||
|
serde_json::Value::Object(post) => post,
|
||||||
|
_ => panic!("invalid post data"),
|
||||||
|
};
|
||||||
|
|
||||||
|
match post.get("id").expect("missing post id") {
|
||||||
|
serde_json::Value::Number(num) => {
|
||||||
|
num.as_i64().expect("invalid post id type") as i32
|
||||||
|
}
|
||||||
|
_ => panic!("invalid post id"),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
ids.into_iter().max().expect("no ids found")
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
let dsn = std::env::var("POSTGRES_DSN").expect("missing postgres dsn");
|
let dsn = std::env::var("POSTGRES_DSN").expect("missing postgres dsn");
|
||||||
|
|
||||||
let (db, connection) = tokio_postgres::connect(&dsn, tokio_postgres::NoTls)
|
let (mut db, connection) = tokio_postgres::connect(&dsn, tokio_postgres::NoTls)
|
||||||
.await
|
.await
|
||||||
.expect("Unable to connect");
|
.expect("Unable to connect");
|
||||||
|
|
||||||
@ -76,27 +123,42 @@ async fn main() {
|
|||||||
println!("max is id: {}", max_id);
|
println!("max is id: {}", max_id);
|
||||||
|
|
||||||
let mut now;
|
let mut now;
|
||||||
let mut min_id: Option<i32> = None;
|
|
||||||
|
// Start with the minimum ID we're requesting being our previous highest
|
||||||
|
// ID found.
|
||||||
|
let mut min_id = max_id;
|
||||||
|
|
||||||
|
// Find highest ID to look for. Once we get this value back, we've gotten
|
||||||
|
// as many new posts as we were looking for.
|
||||||
|
let latest_id = get_latest_id(&client).await;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
now = std::time::Instant::now();
|
now = std::time::Instant::now();
|
||||||
|
|
||||||
|
// Load any posts with an ID higher than our previous run.
|
||||||
let (ids, post_data) = load_page(&client, min_id).await;
|
let (ids, post_data) = load_page(&client, min_id).await;
|
||||||
min_id = ids.into_iter().min();
|
|
||||||
|
|
||||||
db.execute(
|
// Calculate a new minimum value to find posts after by looking at the
|
||||||
"INSERT INTO e621 (data) SELECT json_array_elements($1::json) ON CONFLICT DO NOTHING",
|
// maximum value returned in this run.
|
||||||
&[&post_data],
|
min_id = *ids.iter().max().expect("no ids found");
|
||||||
)
|
|
||||||
.await
|
|
||||||
.expect("Unable to insert");
|
|
||||||
|
|
||||||
if let Some(min_id) = min_id {
|
let tx = db.transaction().await.expect("unable to start transaction");
|
||||||
println!("min id is: {}", min_id);
|
|
||||||
if min_id <= max_id {
|
for post in post_data {
|
||||||
println!("finished run, {}, {}", min_id, max_id);
|
tx.execute(
|
||||||
break;
|
"INSERT INTO e621 (data) VALUES ($1::json) ON CONFLICT DO NOTHING",
|
||||||
}
|
&[&post],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect("Unable to insert");
|
||||||
|
}
|
||||||
|
|
||||||
|
tx.commit().await.expect("unable to commit transaction");
|
||||||
|
|
||||||
|
// If it contains the latest ID, we're done.
|
||||||
|
if ids.contains(&latest_id) {
|
||||||
|
println!("finished run, latest_id {}, max_id {}", latest_id, max_id);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
let elapsed = now.elapsed().as_millis() as u64;
|
let elapsed = now.elapsed().as_millis() as u64;
|
||||||
|
Loading…
Reference in New Issue
Block a user