mirror of
https://github.com/Syfaro/fuzzysearch.git
synced 2024-11-23 15:22:31 +00:00
Add 'fuzzysearch-ingest-weasyl/' from commit '6baf256f09d44c7ac19dd1cdc956fbebae1ffedf'
git-subtree-dir: fuzzysearch-ingest-weasyl
git-subtree-mainline: 348352f23e
git-subtree-split: 6baf256f09
This commit is contained in:
commit
1fda51cb5d
45
fuzzysearch-ingest-weasyl/.drone.yml
Normal file
45
fuzzysearch-ingest-weasyl/.drone.yml
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
type: docker
|
||||||
|
name: default
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: amd64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: build
|
||||||
|
pull: always
|
||||||
|
image: rust:1.47-slim-buster
|
||||||
|
commands:
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y wget libssl-dev pkg-config
|
||||||
|
- wget -O sccache.tar.gz https://github.com/mozilla/sccache/releases/download/0.2.13/sccache-0.2.13-x86_64-unknown-linux-musl.tar.gz
|
||||||
|
- tar zxvf sccache.tar.gz
|
||||||
|
- export RUSTC_WRAPPER=$(pwd)/sccache-0.2.13-x86_64-unknown-linux-musl/sccache
|
||||||
|
- cargo build --release
|
||||||
|
- $(pwd)/sccache-0.2.13-x86_64-unknown-linux-musl/sccache --show-stats
|
||||||
|
- cp ./target/release/weasyl-watcher ./weasyl-watcher
|
||||||
|
environment:
|
||||||
|
AWS_ACCESS_KEY_ID:
|
||||||
|
from_secret: sccache_s3_access_key
|
||||||
|
AWS_SECRET_ACCESS_KEY:
|
||||||
|
from_secret: sccache_s3_secret_key
|
||||||
|
SCCACHE_BUCKET: cache
|
||||||
|
SCCACHE_ENDPOINT:
|
||||||
|
from_secret: sccache_s3_endpoint
|
||||||
|
SCCACHE_S3_USE_SSL: true
|
||||||
|
|
||||||
|
- name: docker
|
||||||
|
image: plugins/docker
|
||||||
|
settings:
|
||||||
|
tags:
|
||||||
|
- latest
|
||||||
|
password:
|
||||||
|
from_secret: docker_password
|
||||||
|
registry: registry.huefox.com
|
||||||
|
repo: registry.huefox.com/weasyl-watcher
|
||||||
|
username:
|
||||||
|
from_secret: docker_username
|
||||||
|
|
||||||
|
...
|
1
fuzzysearch-ingest-weasyl/.gitignore
vendored
Normal file
1
fuzzysearch-ingest-weasyl/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
/target
|
1989
fuzzysearch-ingest-weasyl/Cargo.lock
generated
Normal file
1989
fuzzysearch-ingest-weasyl/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
27
fuzzysearch-ingest-weasyl/Cargo.toml
Normal file
27
fuzzysearch-ingest-weasyl/Cargo.toml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
[package]
|
||||||
|
name = "weasyl-watcher"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Syfaro <syfaro@huefox.com>"]
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow = "1"
|
||||||
|
|
||||||
|
reqwest = { version = "0.10", features = ["json"] }
|
||||||
|
tokio = { version = "0.2", features = ["full"] }
|
||||||
|
|
||||||
|
serde = "1"
|
||||||
|
serde_json = "1"
|
||||||
|
|
||||||
|
image = "0.23"
|
||||||
|
img_hash = "3"
|
||||||
|
|
||||||
|
sha2 = "0.9"
|
||||||
|
|
||||||
|
[dependencies.sqlx]
|
||||||
|
version = "0.4.0-beta.1"
|
||||||
|
default-features = false
|
||||||
|
features = ["runtime-tokio", "macros", "postgres", "json", "offline"]
|
||||||
|
|
||||||
|
[profile.dev.package."*"]
|
||||||
|
opt-level = 2
|
4
fuzzysearch-ingest-weasyl/Dockerfile
Normal file
4
fuzzysearch-ingest-weasyl/Dockerfile
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
FROM debian:buster-slim
|
||||||
|
RUN apt-get update -y && apt-get install -y libssl-dev ca-certificates && rm -rf /var/lib/apt/lists/*
|
||||||
|
COPY ./weasyl-watcher /bin/weasyl-watcher
|
||||||
|
CMD ["/bin/weasyl-watcher"]
|
@ -0,0 +1 @@
|
|||||||
|
CREATE TABLE weasyl (id SERIAL PRIMARY KEY, hash BIGINT, sha256 BYTEA, file_size INT, data JSONB);
|
82
fuzzysearch-ingest-weasyl/sqlx-data.json
Normal file
82
fuzzysearch-ingest-weasyl/sqlx-data.json
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
{
|
||||||
|
"db": "PostgreSQL",
|
||||||
|
"05da31ef5ee193d5094c6e2dc0f7cb00b4b0720a1902af02069861868f176688": {
|
||||||
|
"query": "INSERT INTO weasyl (id, hash, sha256, file_size, data) VALUES ($1, $2, $3, $4, $5)",
|
||||||
|
"describe": {
|
||||||
|
"columns": [],
|
||||||
|
"parameters": {
|
||||||
|
"Left": [
|
||||||
|
"Int4",
|
||||||
|
"Int8",
|
||||||
|
"Bytea",
|
||||||
|
"Int4",
|
||||||
|
"Jsonb"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"nullable": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"18a59439be1a5b6f03326ad14960fe1ada5cee94638711df99b471d86235be24": {
|
||||||
|
"query": "INSERT INTO WEASYL (id) VALUES ($1)",
|
||||||
|
"describe": {
|
||||||
|
"columns": [],
|
||||||
|
"parameters": {
|
||||||
|
"Left": [
|
||||||
|
"Int4"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"nullable": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"364c5c10ad748d1822c3e909aca601993f0ddb7690368a82ae467b3b0950478e": {
|
||||||
|
"query": "INSERT INTO WEASYL (id, data) VALUES ($1, $2)",
|
||||||
|
"describe": {
|
||||||
|
"columns": [],
|
||||||
|
"parameters": {
|
||||||
|
"Left": [
|
||||||
|
"Int4",
|
||||||
|
"Jsonb"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"nullable": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"7ef3d8fa00b1245440aae6f91bfc23bddee7730fc2de67e2f359762ce8db3bf4": {
|
||||||
|
"query": "SELECT id FROM weasyl WHERE id = $1",
|
||||||
|
"describe": {
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"ordinal": 0,
|
||||||
|
"name": "id",
|
||||||
|
"type_info": "Int4"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"parameters": {
|
||||||
|
"Left": [
|
||||||
|
"Int4"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"nullable": [
|
||||||
|
false
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"949eca4258721007af9db04f43830bd8df525f942b6673c7a5713735ed7746d6": {
|
||||||
|
"query": "SELECT max(id) id FROM weasyl",
|
||||||
|
"describe": {
|
||||||
|
"columns": [
|
||||||
|
{
|
||||||
|
"ordinal": 0,
|
||||||
|
"name": "id",
|
||||||
|
"type_info": "Int4"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"parameters": {
|
||||||
|
"Left": []
|
||||||
|
},
|
||||||
|
"nullable": [
|
||||||
|
null
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
213
fuzzysearch-ingest-weasyl/src/main.rs
Normal file
213
fuzzysearch-ingest-weasyl/src/main.rs
Normal file
@ -0,0 +1,213 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sha2::{Digest, Sha256};
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
struct WeasylMediaSubmission {
|
||||||
|
#[serde(rename = "mediaid")]
|
||||||
|
id: i32,
|
||||||
|
url: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
struct WeasylMedia {
|
||||||
|
submission: Vec<WeasylMediaSubmission>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize, PartialEq)]
|
||||||
|
#[serde(rename_all = "lowercase")]
|
||||||
|
enum WeasylSubmissionSubtype {
|
||||||
|
Multimedia,
|
||||||
|
Visual,
|
||||||
|
Literary,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
struct WeasylSubmission {
|
||||||
|
#[serde(rename = "submitid")]
|
||||||
|
id: i32,
|
||||||
|
media: WeasylMedia,
|
||||||
|
subtype: WeasylSubmissionSubtype,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
struct WeasylFrontpageSubmission {
|
||||||
|
#[serde(rename = "submitid")]
|
||||||
|
id: i32,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
struct WeasylError {
|
||||||
|
name: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
#[serde(untagged)]
|
||||||
|
enum WeasylResponse<T> {
|
||||||
|
Error { error: WeasylError },
|
||||||
|
Response(T),
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn load_frontpage(client: &reqwest::Client, api_key: &str) -> anyhow::Result<i32> {
|
||||||
|
let resp: WeasylResponse<Vec<serde_json::Value>> = client
|
||||||
|
.get("https://www.weasyl.com/api/submissions/frontpage")
|
||||||
|
.header("X-Weasyl-API-Key", api_key)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let subs = match resp {
|
||||||
|
WeasylResponse::Response(subs) => subs,
|
||||||
|
WeasylResponse::Error {
|
||||||
|
error: WeasylError { name },
|
||||||
|
} => return Err(anyhow::anyhow!(name)),
|
||||||
|
};
|
||||||
|
|
||||||
|
let max = subs
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|sub| sub.get("submitid").and_then(|id| id.as_i64()))
|
||||||
|
.max()
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
Ok(max as i32)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn load_submission(
|
||||||
|
client: &reqwest::Client,
|
||||||
|
api_key: &str,
|
||||||
|
id: i32,
|
||||||
|
) -> anyhow::Result<(Option<WeasylSubmission>, serde_json::Value)> {
|
||||||
|
println!("Loading submission {}", id);
|
||||||
|
|
||||||
|
let body: serde_json::Value = client
|
||||||
|
.get(&format!(
|
||||||
|
"https://www.weasyl.com/api/submissions/{}/view",
|
||||||
|
id
|
||||||
|
))
|
||||||
|
.header("X-Weasyl-API-Key", api_key)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let data: WeasylResponse<WeasylSubmission> = match serde_json::from_value(body.clone()) {
|
||||||
|
Ok(data) => data,
|
||||||
|
Err(_err) => return Ok((None, body)),
|
||||||
|
};
|
||||||
|
|
||||||
|
let res = match data {
|
||||||
|
WeasylResponse::Response(sub) if sub.subtype == WeasylSubmissionSubtype::Visual => {
|
||||||
|
Some(sub)
|
||||||
|
}
|
||||||
|
WeasylResponse::Response(_sub) => None,
|
||||||
|
WeasylResponse::Error {
|
||||||
|
error: WeasylError { name },
|
||||||
|
} if name == "submissionRecordMissing" => None,
|
||||||
|
WeasylResponse::Error {
|
||||||
|
error: WeasylError { name },
|
||||||
|
} => return Err(anyhow::anyhow!(name)),
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok((res, body))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn process_submission(
|
||||||
|
pool: &sqlx::Pool<sqlx::Postgres>,
|
||||||
|
client: &reqwest::Client,
|
||||||
|
body: serde_json::Value,
|
||||||
|
sub: WeasylSubmission,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
println!("Processing submission {}", sub.id);
|
||||||
|
|
||||||
|
let data = client
|
||||||
|
.get(&sub.media.submission.first().unwrap().url)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.bytes()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let num = if let Ok(image) = image::load_from_memory(&data) {
|
||||||
|
let hasher = img_hash::HasherConfig::with_bytes_type::<[u8; 8]>()
|
||||||
|
.hash_alg(img_hash::HashAlg::Gradient)
|
||||||
|
.hash_size(8, 8)
|
||||||
|
.preproc_dct()
|
||||||
|
.to_hasher();
|
||||||
|
let hash = hasher.hash_image(&image);
|
||||||
|
let mut bytes: [u8; 8] = [0; 8];
|
||||||
|
bytes.copy_from_slice(hash.as_bytes());
|
||||||
|
let num = i64::from_be_bytes(bytes);
|
||||||
|
Some(num)
|
||||||
|
} else {
|
||||||
|
println!("Unable to decode image on submission {}", sub.id);
|
||||||
|
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut hasher = Sha256::new();
|
||||||
|
hasher.update(&data);
|
||||||
|
let result: [u8; 32] = hasher.finalize().into();
|
||||||
|
|
||||||
|
sqlx::query!(
|
||||||
|
"INSERT INTO weasyl (id, hash, sha256, file_size, data) VALUES ($1, $2, $3, $4, $5)",
|
||||||
|
sub.id,
|
||||||
|
num,
|
||||||
|
result.to_vec(),
|
||||||
|
data.len() as i32,
|
||||||
|
body
|
||||||
|
)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn insert_null(
|
||||||
|
pool: &sqlx::Pool<sqlx::Postgres>,
|
||||||
|
body: serde_json::Value,
|
||||||
|
id: i32,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
println!("Inserting null for submission {}", id);
|
||||||
|
|
||||||
|
sqlx::query!("INSERT INTO WEASYL (id, data) VALUES ($1, $2)", id, body)
|
||||||
|
.execute(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let api_key = std::env::var("WEASYL_APIKEY").unwrap();
|
||||||
|
|
||||||
|
let pool = sqlx::postgres::PgPoolOptions::new()
|
||||||
|
.max_connections(2)
|
||||||
|
.connect(&std::env::var("DATABASE_URL").unwrap())
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let client = reqwest::Client::new();
|
||||||
|
|
||||||
|
let min = sqlx::query!("SELECT max(id) id FROM weasyl")
|
||||||
|
.fetch_one(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap()
|
||||||
|
.id
|
||||||
|
.unwrap_or_default();
|
||||||
|
|
||||||
|
let max = load_frontpage(&client, &api_key).await.unwrap();
|
||||||
|
|
||||||
|
for id in (min + 1)..=max {
|
||||||
|
let row: Option<_> = sqlx::query!("SELECT id FROM weasyl WHERE id = $1", id)
|
||||||
|
.fetch_optional(&pool)
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
if row.is_some() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
match load_submission(&client, &api_key, id).await.unwrap() {
|
||||||
|
(Some(sub), json) => process_submission(&pool, &client, json, sub).await.unwrap(),
|
||||||
|
(None, body) => insert_null(&pool, body, id).await.unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user