fuzzysearch/fuzzysearch-ingest-weasyl/src/main.rs

214 lines
5.4 KiB
Rust
Raw Normal View History

2020-10-09 18:02:58 +00:00
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
/// One media file attached to a submission, as it appears in the
/// `media.submission` array of a Weasyl API response.
#[derive(Debug, Deserialize, Serialize)]
struct WeasylMediaSubmission {
    /// Identifier of the media file; carried in the JSON field `mediaid`.
    #[serde(rename = "mediaid")]
    id: i32,
    /// Location of the media file; `process_submission` downloads from here.
    url: String,
}
/// The `media` object of a submission response.
#[derive(Debug, Deserialize, Serialize)]
struct WeasylMedia {
    /// Entries of the `submission` array; only the first one is downloaded
    /// by `process_submission`.
    submission: Vec<WeasylMediaSubmission>,
}
/// Value of a submission's `subtype` field.
///
/// Variants are (de)serialized as their lowercase names (`"multimedia"`,
/// `"visual"`, `"literary"`). Any other string fails to deserialize.
#[derive(Debug, PartialEq, Deserialize, Serialize)]
#[serde(rename_all = "lowercase")]
enum WeasylSubmissionSubtype {
    Multimedia,
    /// The only subtype that `load_submission` hands on for processing.
    Visual,
    Literary,
}
/// The subset of a submission "view" response that this program reads.
#[derive(Debug, Deserialize, Serialize)]
struct WeasylSubmission {
    /// Submission identifier; carried in the JSON field `submitid`.
    #[serde(rename = "submitid")]
    id: i32,
    /// Media attached to the submission.
    media: WeasylMedia,
    /// Kind of submission, used to decide whether it gets processed.
    subtype: WeasylSubmissionSubtype,
}
/// Minimal typed view of a front page entry.
///
/// NOTE(review): nothing in the visible code refers to this type;
/// `load_frontpage` reads `submitid` from raw JSON values instead.
#[derive(Debug, Deserialize, Serialize)]
struct WeasylFrontpageSubmission {
    /// Submission identifier; carried in the JSON field `submitid`.
    #[serde(rename = "submitid")]
    id: i32,
}
/// Payload of the `error` object in a failed API response.
#[derive(Debug, Deserialize, Serialize)]
struct WeasylError {
    /// Error name reported by the API, e.g. `submissionRecordMissing`.
    name: String,
}
/// Either an API error or a successful payload of type `T`.
///
/// The enum is untagged, so serde tries the variants in declaration order:
/// a body with an `error` object matching [`WeasylError`] becomes `Error`,
/// and anything else must deserialize as `T`. Keep `Error` first.
#[derive(Debug, Deserialize, Serialize)]
#[serde(untagged)]
enum WeasylResponse<T> {
    /// Body of the shape `{"error": {"name": ...}}`.
    Error { error: WeasylError },
    /// Any body that deserializes as `T`.
    Response(T),
}
async fn load_frontpage(client: &reqwest::Client, api_key: &str) -> anyhow::Result<i32> {
2020-10-14 15:33:29 +00:00
let resp: WeasylResponse<Vec<serde_json::Value>> = client
2020-10-09 18:02:58 +00:00
.get("https://www.weasyl.com/api/submissions/frontpage")
.header("X-Weasyl-API-Key", api_key)
.send()
.await?
.json()
.await?;
let subs = match resp {
WeasylResponse::Response(subs) => subs,
WeasylResponse::Error {
error: WeasylError { name },
} => return Err(anyhow::anyhow!(name)),
};
2020-10-14 15:33:29 +00:00
let max = subs
.into_iter()
.filter_map(|sub| sub.get("submitid").and_then(|id| id.as_i64()))
.max()
.unwrap_or_default();
2020-10-09 18:02:58 +00:00
2020-10-14 15:33:29 +00:00
Ok(max as i32)
2020-10-09 18:02:58 +00:00
}
async fn load_submission(
client: &reqwest::Client,
api_key: &str,
id: i32,
2020-10-09 19:04:59 +00:00
) -> anyhow::Result<(Option<WeasylSubmission>, serde_json::Value)> {
2020-10-09 18:02:58 +00:00
println!("Loading submission {}", id);
2020-10-09 19:04:59 +00:00
let body: serde_json::Value = client
2020-10-09 18:02:58 +00:00
.get(&format!(
"https://www.weasyl.com/api/submissions/{}/view",
id
))
.header("X-Weasyl-API-Key", api_key)
.send()
.await?
.json()
2020-10-09 19:04:59 +00:00
.await?;
2020-10-09 18:58:28 +00:00
2020-10-09 19:04:59 +00:00
let data: WeasylResponse<WeasylSubmission> = match serde_json::from_value(body.clone()) {
Ok(data) => data,
Err(_err) => return Ok((None, body)),
2020-10-09 18:58:28 +00:00
};
2020-10-09 18:02:58 +00:00
let res = match data {
WeasylResponse::Response(sub) if sub.subtype == WeasylSubmissionSubtype::Visual => {
2020-10-09 19:04:59 +00:00
Some(sub)
2020-10-09 18:02:58 +00:00
}
WeasylResponse::Response(_sub) => None,
WeasylResponse::Error {
error: WeasylError { name },
} if name == "submissionRecordMissing" => None,
WeasylResponse::Error {
error: WeasylError { name },
} => return Err(anyhow::anyhow!(name)),
};
2020-10-09 19:04:59 +00:00
Ok((res, body))
2020-10-09 18:02:58 +00:00
}
async fn process_submission(
pool: &sqlx::Pool<sqlx::Postgres>,
client: &reqwest::Client,
body: serde_json::Value,
sub: WeasylSubmission,
) -> anyhow::Result<()> {
println!("Processing submission {}", sub.id);
let data = client
.get(&sub.media.submission.first().unwrap().url)
.send()
.await?
.bytes()
.await?;
2020-10-09 23:10:44 +00:00
let num = if let Ok(image) = image::load_from_memory(&data) {
2020-12-08 00:17:38 +00:00
let hasher = fuzzysearch_common::get_hasher();
2020-10-09 23:10:44 +00:00
let hash = hasher.hash_image(&image);
let mut bytes: [u8; 8] = [0; 8];
bytes.copy_from_slice(hash.as_bytes());
let num = i64::from_be_bytes(bytes);
Some(num)
} else {
println!("Unable to decode image on submission {}", sub.id);
None
};
2020-10-09 18:02:58 +00:00
let mut hasher = Sha256::new();
hasher.update(&data);
let result: [u8; 32] = hasher.finalize().into();
sqlx::query!(
"INSERT INTO weasyl (id, hash, sha256, file_size, data) VALUES ($1, $2, $3, $4, $5)",
sub.id,
num,
result.to_vec(),
data.len() as i32,
body
)
.execute(pool)
.await?;
Ok(())
}
2020-10-09 19:04:59 +00:00
/// Records submission `id` with only its raw API response `body`.
///
/// Only the `id` and `data` columns are written; the insert statement sets
/// no other column.
///
/// # Errors
///
/// Returns an error when the insert fails.
async fn insert_null(
    pool: &sqlx::Pool<sqlx::Postgres>,
    body: serde_json::Value,
    id: i32,
) -> anyhow::Result<()> {
    println!("Inserting null for submission {}", id);

    let _outcome = sqlx::query!("INSERT INTO WEASYL (id, data) VALUES ($1, $2)", id, body)
        .execute(pool)
        .await?;

    Ok(())
}
#[tokio::main]
async fn main() {
let api_key = std::env::var("WEASYL_APIKEY").unwrap();
let pool = sqlx::postgres::PgPoolOptions::new()
.max_connections(2)
.connect(&std::env::var("DATABASE_URL").unwrap())
.await
.unwrap();
let client = reqwest::Client::new();
2020-12-08 00:17:38 +00:00
loop {
let min = sqlx::query!("SELECT max(id) id FROM weasyl")
.fetch_one(&pool)
2020-10-09 18:02:58 +00:00
.await
2020-12-08 00:17:38 +00:00
.unwrap()
.id
.unwrap_or_default();
let max = load_frontpage(&client, &api_key).await.unwrap();
for id in (min + 1)..=max {
let row: Option<_> = sqlx::query!("SELECT id FROM weasyl WHERE id = $1", id)
.fetch_optional(&pool)
.await
.unwrap();
if row.is_some() {
continue;
}
match load_submission(&client, &api_key, id).await.unwrap() {
(Some(sub), json) => process_submission(&pool, &client, json, sub).await.unwrap(),
(None, body) => insert_null(&pool, body, id).await.unwrap(),
}
2020-10-09 18:02:58 +00:00
}
2021-02-21 02:16:58 +00:00
tokio::time::sleep(std::time::Duration::from_secs(60 * 5)).await;
2020-10-09 18:02:58 +00:00
}
}