mirror of
https://github.com/Syfaro/fuzzysearch.git
synced 2024-11-10 17:02:38 +00:00
Initial commit.
This commit is contained in:
commit
f207d110a1
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
/target
|
2453
Cargo.lock
generated
Normal file
2453
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
21
Cargo.toml
Normal file
21
Cargo.toml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
[package]
|
||||||
|
name = "e621-watcher"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Syfaro <syfaro@huefox.com>"]
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
reqwest = { version = "*", features = ["json"] }
|
||||||
|
tokio = { version = "0.2", features = ["full"] }
|
||||||
|
futures = { version = "*", features = ["thread-pool"] }
|
||||||
|
|
||||||
|
serde = "*"
|
||||||
|
serde_json = "*"
|
||||||
|
|
||||||
|
tokio-postgres = { version = "0.5.1", features = ["with-serde_json-1"] }
|
||||||
|
bb8 = { git = "https://github.com/khuey/bb8.git" }
|
||||||
|
bb8-postgres = { git = "https://github.com/khuey/bb8.git" }
|
||||||
|
|
||||||
|
furaffinity-rs = { git = "https://git.huefox.com/syfaro/furaffinity-rs.git" }
|
||||||
|
image = "0.22"
|
||||||
|
img_hash = "*"
|
125
src/bin/import.rs
Normal file
125
src/bin/import.rs
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
async fn load_page(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
before_id: Option<i32>,
|
||||||
|
) -> (Vec<i32>, serde_json::Value) {
|
||||||
|
println!("Loading page with before_id {:?}", before_id);
|
||||||
|
|
||||||
|
let mut query: Vec<(&'static str, String)> =
|
||||||
|
vec![("typed_tags", "true".into()), ("count", "320".into())];
|
||||||
|
|
||||||
|
if let Some(before_id) = before_id {
|
||||||
|
query.push(("before_id", before_id.to_string()));
|
||||||
|
if before_id == 1 {
|
||||||
|
panic!("that's it.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let body = client
|
||||||
|
.get("https://e621.net/post/index.json")
|
||||||
|
.query(&query)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.expect("unable to make request")
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.expect("unable to convert to text");
|
||||||
|
|
||||||
|
let json = serde_json::from_str(&body).expect("Unable to parse data");
|
||||||
|
|
||||||
|
let posts = match json {
|
||||||
|
serde_json::Value::Array(ref arr) => arr,
|
||||||
|
_ => panic!("invalid response"),
|
||||||
|
};
|
||||||
|
|
||||||
|
let ids = posts
|
||||||
|
.iter()
|
||||||
|
.map(|post| {
|
||||||
|
let post = match post {
|
||||||
|
serde_json::Value::Object(post) => post,
|
||||||
|
_ => panic!("invalid post data"),
|
||||||
|
};
|
||||||
|
|
||||||
|
match post.get("id").expect("missing post id") {
|
||||||
|
serde_json::Value::Number(num) => {
|
||||||
|
num.as_i64().expect("invalid post id type") as i32
|
||||||
|
}
|
||||||
|
_ => panic!("invalid post id"),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
(ids, json)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let dsn = std::env::var("POSTGRES_DSN").expect("missing postgres dsn");
|
||||||
|
|
||||||
|
let (db, connection) = tokio_postgres::connect(&dsn, tokio_postgres::NoTls)
|
||||||
|
.await
|
||||||
|
.expect("Unable to connect");
|
||||||
|
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Err(e) = connection.await {
|
||||||
|
eprintln!("connection error: {}", e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
db.execute(
|
||||||
|
"CREATE TABLE IF NOT EXISTS post (id INTEGER PRIMARY KEY, data JSONB)",
|
||||||
|
&[],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect("Unable to create table");
|
||||||
|
|
||||||
|
db.execute(
|
||||||
|
"CREATE OR REPLACE FUNCTION extract_post_data() RETURNS TRIGGER AS $$
|
||||||
|
BEGIN
|
||||||
|
NEW.id = NEW.data->'id';
|
||||||
|
RETURN NEW;
|
||||||
|
END $$
|
||||||
|
LANGUAGE 'plpgsql'",
|
||||||
|
&[],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect("Unable to create function");
|
||||||
|
|
||||||
|
db.execute("DROP TRIGGER IF EXISTS call_extract_post_data ON post", &[])
|
||||||
|
.await
|
||||||
|
.expect("Unable to drop trigger");
|
||||||
|
db.execute("CREATE TRIGGER call_extract_post_data BEFORE INSERT ON post FOR EACH ROW EXECUTE PROCEDURE extract_post_data()", &[]).await.expect("Unable to create trigger");
|
||||||
|
|
||||||
|
let mut min_id = db
|
||||||
|
.query_one("SELECT MIN(id) FROM post", &[])
|
||||||
|
.await
|
||||||
|
.map(|row| row.get("min"))
|
||||||
|
.expect("Unable to get min post");
|
||||||
|
|
||||||
|
let client = reqwest::Client::builder()
|
||||||
|
.user_agent("Syfaro test client syfaro@huefox.com")
|
||||||
|
.build()
|
||||||
|
.expect("Unable to build http client");
|
||||||
|
|
||||||
|
let mut now;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
now = std::time::Instant::now();
|
||||||
|
|
||||||
|
let (ids, post_data) = load_page(&client, min_id).await;
|
||||||
|
min_id = ids.into_iter().min();
|
||||||
|
|
||||||
|
db.execute(
|
||||||
|
"INSERT INTO post (data) SELECT json_array_elements($1::json)",
|
||||||
|
&[&post_data],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect("Unable to insert");
|
||||||
|
|
||||||
|
let elapsed = now.elapsed().as_millis() as u64;
|
||||||
|
if elapsed < 1000 {
|
||||||
|
let delay = 1000 - elapsed;
|
||||||
|
println!("delaying {}ms before loading next page", delay);
|
||||||
|
tokio::time::delay_for(std::time::Duration::from_millis(delay)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
110
src/bin/load_hashes.rs
Normal file
110
src/bin/load_hashes.rs
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
use bb8::Pool;
|
||||||
|
use bb8_postgres::PostgresConnectionManager;
|
||||||
|
use futures::future::FutureExt;
|
||||||
|
|
||||||
|
/// A post that still needs an image hash computed.
struct NeededPost {
    /// Value of the `id` column of the `post` row.
    id: i32,
    /// Location the post's image is downloaded from.
    full_url: String,
}
|
||||||
|
|
||||||
|
async fn hash_url(
|
||||||
|
client: std::sync::Arc<reqwest::Client>,
|
||||||
|
url: String,
|
||||||
|
) -> (img_hash::ImageHash, i64) {
|
||||||
|
println!("loading {}", url);
|
||||||
|
|
||||||
|
let data = client
|
||||||
|
.get(&url)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.expect("unable to get url")
|
||||||
|
.bytes()
|
||||||
|
.await
|
||||||
|
.expect("unable to get bytes");
|
||||||
|
|
||||||
|
let hasher = furaffinity_rs::get_hasher();
|
||||||
|
let image = image::load_from_memory(&data).expect("unable to parse image");
|
||||||
|
|
||||||
|
let hash = hasher.hash_image(&image);
|
||||||
|
let mut bytes: [u8; 8] = [0; 8];
|
||||||
|
bytes.copy_from_slice(hash.as_bytes());
|
||||||
|
|
||||||
|
let num = i64::from_be_bytes(bytes);
|
||||||
|
|
||||||
|
println!("{} - {}", url, num);
|
||||||
|
|
||||||
|
(hash, num)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let dsn = std::env::var("POSTGRES_DSN").expect("missing postgres dsn");
|
||||||
|
|
||||||
|
use std::str::FromStr;
|
||||||
|
let manager = PostgresConnectionManager::new(
|
||||||
|
tokio_postgres::Config::from_str(&dsn).expect("unable to parse postgres dsn"),
|
||||||
|
tokio_postgres::NoTls,
|
||||||
|
);
|
||||||
|
|
||||||
|
let pool = match Pool::builder().build(manager).await {
|
||||||
|
Ok(pool) => pool,
|
||||||
|
Err(e) => panic!("unable to build pool: {}", e),
|
||||||
|
};
|
||||||
|
|
||||||
|
let client = reqwest::Client::builder()
|
||||||
|
.user_agent("Syfaro test client syfaro@huefox.com")
|
||||||
|
.build()
|
||||||
|
.expect("Unable to build http client");
|
||||||
|
let client = std::sync::Arc::new(client);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
println!("getting next 100 posts");
|
||||||
|
|
||||||
|
let db = pool.clone();
|
||||||
|
|
||||||
|
let needed_posts: Vec<_> = db
|
||||||
|
.get()
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.query(
|
||||||
|
"SELECT
|
||||||
|
id,
|
||||||
|
data->>'file_url' file_url
|
||||||
|
FROM
|
||||||
|
post
|
||||||
|
WHERE
|
||||||
|
hash IS NULL AND
|
||||||
|
data->>'file_ext' IN ('jpg', 'png') AND
|
||||||
|
data->>'file_url' <> '/images/deleted-preview.png'
|
||||||
|
LIMIT 100",
|
||||||
|
&[],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect("Unable to get posts")
|
||||||
|
.into_iter()
|
||||||
|
.map(|row| NeededPost {
|
||||||
|
id: row.get("id"),
|
||||||
|
full_url: row.get("file_url"),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
for chunk in needed_posts.chunks(8) {
|
||||||
|
let futs = chunk.iter().map(|post| {
|
||||||
|
let db = db.clone();
|
||||||
|
let client = client.clone();
|
||||||
|
let id = post.id;
|
||||||
|
|
||||||
|
hash_url(client, post.full_url.clone()).then(move |(_hash, num)| async move {
|
||||||
|
db.get()
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.execute("UPDATE post SET hash = $2 WHERE id = $1", &[&id, &num])
|
||||||
|
.await
|
||||||
|
.expect("Unable to update hash in database");
|
||||||
|
})
|
||||||
|
});
|
||||||
|
|
||||||
|
futures::future::join_all(futs).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
97
src/bin/query_hash.rs
Normal file
97
src/bin/query_hash.rs
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
/// One candidate match read back from the `post` table.
#[derive(Debug)]
struct Row {
    /// Post id; used to build the post's e621 URL.
    id: i32,
    /// Entries of the post's `artist` JSON array.
    artists: Vec<String>,
    /// Entries of the post's `sources` JSON array.
    sources: Vec<String>,
}
|
||||||
|
|
||||||
|
async fn get_hash_distance_from_url(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
url: &str,
|
||||||
|
other: &img_hash::ImageHash,
|
||||||
|
) -> Result<u32, Box<dyn std::error::Error>> {
|
||||||
|
let data = client.get(url).send().await?.bytes().await?;
|
||||||
|
|
||||||
|
let hasher = furaffinity_rs::get_hasher();
|
||||||
|
let image = image::load_from_memory(&data)?;
|
||||||
|
|
||||||
|
let hash = hasher.hash_image(&image);
|
||||||
|
Ok(hash.dist(&other))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let dsn = std::env::var("POSTGRES_DSN").expect("missing postgres dsn");
|
||||||
|
let file = std::env::args().nth(1).expect("missing image");
|
||||||
|
|
||||||
|
let (db, connection) = tokio_postgres::connect(&dsn, tokio_postgres::NoTls)
|
||||||
|
.await
|
||||||
|
.expect("Unable to connect");
|
||||||
|
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Err(e) = connection.await {
|
||||||
|
eprintln!("connection error: {}", e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let client = reqwest::Client::builder()
|
||||||
|
.user_agent("Syfaro test client syfaro@huefox.com")
|
||||||
|
.build()
|
||||||
|
.expect("Unable to build http client");
|
||||||
|
|
||||||
|
let image = image::open(&file).expect("unable to open image");
|
||||||
|
|
||||||
|
let hasher = furaffinity_rs::get_hasher();
|
||||||
|
let hash = hasher.hash_image(&image);
|
||||||
|
|
||||||
|
let mut bytes: [u8; 8] = [0; 8];
|
||||||
|
bytes.copy_from_slice(hash.as_bytes());
|
||||||
|
|
||||||
|
let num = i64::from_be_bytes(bytes);
|
||||||
|
|
||||||
|
let rows = db
|
||||||
|
.query(
|
||||||
|
"SELECT
|
||||||
|
post.id id,
|
||||||
|
artists_agg.artists artists,
|
||||||
|
sources_agg.sources sources
|
||||||
|
FROM
|
||||||
|
post,
|
||||||
|
LATERAL (
|
||||||
|
SELECT array_agg(v) artists FROM jsonb_array_elements_text(data->'artist') v
|
||||||
|
) artists_agg,
|
||||||
|
LATERAL (
|
||||||
|
SELECT array_agg(v) sources FROM jsonb_array_elements_text(data->'sources') v
|
||||||
|
) sources_agg
|
||||||
|
WHERE hash <@ ($1, 10)",
|
||||||
|
&[&num],
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect("unable to query")
|
||||||
|
.into_iter()
|
||||||
|
.map(|row| Row {
|
||||||
|
id: row.get("id"),
|
||||||
|
sources: row.get("sources"),
|
||||||
|
artists: row.get("artists"),
|
||||||
|
});
|
||||||
|
|
||||||
|
for row in rows {
|
||||||
|
println!(
|
||||||
|
"Possible match: https://e621.net/post/show/{} by {}",
|
||||||
|
row.id,
|
||||||
|
row.artists.join(", ")
|
||||||
|
);
|
||||||
|
for source in row.sources {
|
||||||
|
let distance = get_hash_distance_from_url(&client, &source, &hash).await;
|
||||||
|
println!(
|
||||||
|
"- {} (distance of {})",
|
||||||
|
source,
|
||||||
|
if let Ok(d) = distance {
|
||||||
|
d.to_string()
|
||||||
|
} else {
|
||||||
|
"unknown".to_string()
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user