Add methods to extract hashes from a GIF or video.

This commit is contained in:
Syfaro 2020-12-06 20:02:01 -06:00
parent f6319e6d90
commit 9eb653ce6b
7 changed files with 355 additions and 16 deletions

View File

@ -8,6 +8,12 @@ platform:
arch: amd64
steps:
- name: test
image: rust:1-slim
commands:
- cargo build
- cargo test
- name: build-latest
image: plugins/docker
settings:
@ -22,19 +28,4 @@ steps:
branch:
- master
- name: build-branch
image: plugins/docker
settings:
password:
from_secret: docker_password
registry: registry.huefox.com
repo: registry.huefox.com/fuzzysearch
tags: ${DRONE_BRANCH}
username:
from_secret: docker_username
when:
branch:
exclude:
- master
...

136
Cargo.lock generated
View File

@ -21,6 +21,12 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "anyhow"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c0df63cb2955042487fad3aefd2c6e3ae7389ac5dc1beb28921de0b69f779d4"
[[package]]
name = "async-trait"
version = "0.1.42"
@ -87,6 +93,26 @@ dependencies = [
"tokio-postgres",
]
[[package]]
name = "bindgen"
version = "0.54.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "66c0bb6167449588ff70803f4127f0684f9063097eca5016f37eb52b92c2cf36"
dependencies = [
"bitflags",
"cexpr",
"cfg-if 0.1.10",
"clang-sys",
"lazy_static",
"lazycell",
"peeking_take_while",
"proc-macro2",
"quote",
"regex",
"rustc-hash",
"shlex",
]
[[package]]
name = "bitflags"
version = "1.2.1"
@ -175,6 +201,15 @@ version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48"
[[package]]
name = "cexpr"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27"
dependencies = [
"nom",
]
[[package]]
name = "cfg-if"
version = "0.1.10"
@ -200,6 +235,17 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "clang-sys"
version = "0.29.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a"
dependencies = [
"glob",
"libc",
"libloading",
]
[[package]]
name = "cloudabi"
version = "0.0.3"
@ -344,6 +390,31 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
[[package]]
name = "ffmpeg-next"
version = "4.3.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e17e735bf446b8e57b794fcb5841106817e890de40275dfad367493a752c3e9"
dependencies = [
"bitflags",
"ffmpeg-sys-next",
"libc",
]
[[package]]
name = "ffmpeg-sys-next"
version = "4.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fde8cbf91a1b044b86d9e9e944c33806a68f5e34e4281033594ceaab47a3746"
dependencies = [
"bindgen",
"cc",
"libc",
"num_cpus",
"pkg-config",
"vcpkg",
]
[[package]]
name = "fnv"
version = "1.0.7"
@ -481,11 +552,13 @@ dependencies = [
name = "fuzzysearch"
version = "0.1.0"
dependencies = [
"anyhow",
"bb8",
"bb8-postgres",
"bk-tree",
"bytes 0.5.6",
"chrono",
"ffmpeg-next",
"futures",
"futures-util",
"hamming",
@ -494,6 +567,7 @@ dependencies = [
"opentelemetry",
"opentelemetry-jaeger",
"serde",
"tempfile",
"tokio 0.3.5",
"tokio-postgres",
"tracing",
@ -556,6 +630,12 @@ dependencies = [
"weezl",
]
[[package]]
name = "glob"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
[[package]]
name = "h2"
version = "0.2.7"
@ -807,12 +887,28 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "lazycell"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.81"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1482821306169ec4d07f6aca392a4681f66c75c9918aa49641a2595db64053cb"
[[package]]
name = "libloading"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753"
dependencies = [
"cc",
"winapi 0.3.9",
]
[[package]]
name = "lock_api"
version = "0.4.2"
@ -998,6 +1094,16 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "nom"
version = "5.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af"
dependencies = [
"memchr",
"version_check",
]
[[package]]
name = "ntapi"
version = "0.3.6"
@ -1144,6 +1250,12 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
[[package]]
name = "percent-encoding"
version = "2.1.0"
@ -1226,6 +1338,12 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkg-config"
version = "0.3.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c"
[[package]]
name = "png"
version = "0.16.7"
@ -1550,6 +1668,12 @@ dependencies = [
"winapi 0.3.9",
]
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc_version"
version = "0.2.3"
@ -1717,6 +1841,12 @@ dependencies = [
"loom",
]
[[package]]
name = "shlex"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2"
[[package]]
name = "siphasher"
version = "0.3.3"
@ -2198,6 +2328,12 @@ version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7"
[[package]]
name = "vcpkg"
version = "0.2.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6454029bf181f092ad1b853286f23e2c507d8e8194d01d92da4a55c274a5508c"
[[package]]
name = "version_check"
version = "0.9.2"

View File

@ -17,8 +17,10 @@ tokio = { version = "0.3", features = ["macros", "rt-multi-thread", "sync"] }
futures = "0.3"
futures-util = "0.3"
anyhow = "1"
chrono = "0.4"
bytes = "0.5"
tempfile = "3"
serde = { version = "1", features = ["derive"] }
warp = "0.2"
@ -27,8 +29,10 @@ tokio-postgres = "0.6"
bb8 = "0.6"
bb8-postgres = "0.6"
img_hash = "3"
image = "0.23"
ffmpeg-next = "4"
img_hash = "3"
hamming = "0.1"
bk-tree = "0.3"

View File

@ -9,6 +9,7 @@ mod handlers;
mod models;
mod types;
mod utils;
mod video;
use warp::Filter;
@ -89,6 +90,8 @@ impl bk_tree::Metric<Node> for Hamming {
#[tokio::main]
async fn main() {
ffmpeg_next::init().expect("Unable to initialize ffmpeg");
configure_tracing();
let s = std::env::var("POSTGRES_DSN").expect("Missing POSTGRES_DSN");

205
src/video.rs Normal file
View File

@ -0,0 +1,205 @@
use std::convert::TryInto;
use std::io::Read;
use ffmpeg_next::{
format::{input, Pixel},
media::Type as MediaType,
software::scaling::{context::Context, Flags as ScalingFlags},
util::frame::Video,
};
use image::{gif::GifDecoder, AnimationDecoder};
use tempfile::NamedTempFile;
use crate::get_hasher;
/// Extract frames of a GIF into individual images and calculate a hash for each
/// frame. Results are kept in the same order as seen in the GIF.
///
/// This is a blocking function.
#[tracing::instrument(skip(r))]
pub fn extract_gif_hashes<R: Read>(r: R) -> Result<Vec<[u8; 8]>, image::ImageError> {
let hasher = crate::get_hasher();
// Begin by creating a new GifDecoder from our reader. Collect all frames
// from the GIF.
//
// FUTURE: profile memory usage of collecting all frames instead of iterating
let decoder = GifDecoder::new(r)?;
let frames = decoder.into_frames().collect_frames()?;
tracing::trace!(frames = frames.len(), "Collected GIF frames");
// Allocate a Vec to hold all our hashes.
let mut hashes = Vec::with_capacity(frames.len());
// For each frame, get an ImageBuffer, hash the image, and append bytes into
// the results.
//
// FUTURE: should this be parallelized?
for frame in frames {
let buf = frame.buffer();
let hash = hasher.hash_image(buf);
let bytes = hash.as_bytes().try_into().unwrap();
hashes.push(bytes);
}
Ok(hashes)
}
/// Copy everything readable from `r` into a freshly created named temporary
/// file and hand back the file handle. The file is expected to be removed
/// automatically once the returned handle is dropped.
///
/// This is a blocking function.
fn write_temp_file<R: Read>(mut r: R) -> std::io::Result<NamedTempFile> {
    NamedTempFile::new().and_then(|mut temp| {
        // The number of bytes copied is not needed by callers.
        let _copied = std::io::copy(&mut r, &mut temp)?;
        Ok(temp)
    })
}
/// Extract frames of a video into individual images and calculate a hash for
/// each frame. Results are kept in the same order as seen in the input.
///
/// The contents of `r` are first written to a temporary file because the
/// ffmpeg input context is opened from a path.
///
/// This is a blocking function.
///
/// # Errors
///
/// Returns an error if the temporary file cannot be written, if ffmpeg cannot
/// open the input, if the input has no video stream, or if sending packets to
/// the decoder or converting a frame to RGB fails.
///
/// # Panics
///
/// Panics if a converted frame does not contain enough pixel data for its
/// reported dimensions, or if the hasher produces a hash that is not exactly
/// 8 bytes long.
#[tracing::instrument(skip(r))]
pub fn extract_video_hashes<R: Read>(r: R) -> anyhow::Result<Vec<[u8; 8]>> {
    let f = write_temp_file(r)?;

    // Create an input context from the given path.
    //
    // TODO: figure out if there's a way to provide data without creating a file
    let mut ictx = input(&f.path())?;

    // Select the best video stream and find its index.
    let video_stream = ictx
        .streams()
        .best(MediaType::Video)
        .ok_or(ffmpeg_next::Error::StreamNotFound)?;
    let stream_index = video_stream.index();

    // Create a new decoder that outputs 8-bit RGB colors with the same
    // dimensions as the source.
    let mut decoder = video_stream.codec().decoder().video()?;
    let mut scaler = Context::get(
        decoder.format(),
        decoder.width(),
        decoder.height(),
        Pixel::RGB24,
        decoder.width(),
        decoder.height(),
        ScalingFlags::BILINEAR,
    )?;

    tracing::trace!("Initialized ffmpeg with video input");

    let mut hashes: Vec<[u8; 8]> = Vec::new();
    let hasher = get_hasher();

    // Callback function run for each packet loaded by ffmpeg. It's responsible
    // for processing each frame into a hash and storing it.
    let mut receive_and_process_decoded_frames =
        |decoder: &mut ffmpeg_next::decoder::Video| -> Result<(), ffmpeg_next::Error> {
            let mut decoded = Video::empty();

            // NOTE(review): any error from `receive_frame` ends the loop, so a
            // genuine decode failure is indistinguishable from "no more frames
            // available right now" — confirm this is intended.
            while decoder.receive_frame(&mut decoded).is_ok() {
                // Create a frame buffer and convert the decoded data into it.
                let mut rgb_frame = Video::empty();
                scaler.run(&decoded, &mut rgb_frame)?;

                let width = rgb_frame.width();
                let height = rgb_frame.height();

                // RGB24 uses 3 bytes per pixel. u32 -> usize is lossless on
                // the 32- and 64-bit targets this builds for.
                let row_len = width as usize * 3;
                let stride = rgb_frame.stride(0);
                let plane = rgb_frame.data(0);

                // Each row of the plane occupies `stride` bytes, which may be
                // larger than `row_len` because of alignment padding. The
                // image buffer expects tightly packed rows, so copy only the
                // pixel bytes of every row. Copying the plane verbatim would
                // shear the image whenever padding is present.
                let mut data = Vec::with_capacity(row_len * height as usize);
                for row in plane.chunks(stride.max(1)).take(height as usize) {
                    // A short row leaves `data` undersized, which is reported
                    // by the `expect` below instead of an index panic here.
                    if let Some(pixels) = row.get(..row_len) {
                        data.extend_from_slice(pixels);
                    }
                }

                // Convert raw data into an RgbImage for use with image hashing.
                let im: image::RgbImage = image::ImageBuffer::from_raw(width, height, data)
                    .expect("Image frame data was invalid");

                // Hash frame, convert to [u8; 8].
                let hash = hasher.hash_image(&im);
                let hash = hash.as_bytes();
                hashes.push(
                    hash.try_into()
                        .expect("img_hash provided incorrect number of bytes"),
                );
            }

            Ok(())
        };

    // Now that we've set up our callback, iterate through file packets, decode
    // them, and send to our callback for processing.
    for (stream, packet) in ictx.packets() {
        // Skip packets belonging to audio, subtitle or other video streams.
        if stream.index() != stream_index {
            continue;
        }

        decoder.send_packet(&packet)?;
        receive_and_process_decoded_frames(&mut decoder)?;
    }

    // Make sure all data has been processed with EOF.
    decoder.send_eof()?;
    receive_and_process_decoded_frames(&mut decoder)?;

    Ok(hashes)
}
#[cfg(test)]
mod tests {
    use std::fs::File;

    use super::*;

    /// Hashes the bundled GIF fixture and checks the frame count along with
    /// the hashes of the first two frames.
    #[test]
    fn test_extract_gif_hashes() -> anyhow::Result<()> {
        const EXPECTED_FRAMES: usize = 47;
        const EXPECTED_FIRST: [u8; 8] = [154, 64, 160, 169, 170, 53, 181, 221];
        const EXPECTED_SECOND: [u8; 8] = [154, 64, 160, 169, 170, 53, 53, 221];

        let fixture = File::open("tests/fox.gif")?;
        let frame_hashes = extract_gif_hashes(&fixture)?;

        assert_eq!(
            frame_hashes.len(),
            EXPECTED_FRAMES,
            "GIF did not have expected number of hashes"
        );
        assert_eq!(
            frame_hashes[0], EXPECTED_FIRST,
            "First frame had different hash"
        );
        assert_eq!(
            frame_hashes[1], EXPECTED_SECOND,
            "Second frame had different hash"
        );

        Ok(())
    }

    /// Hashes the bundled WebM fixture and checks the frame count along with
    /// the hashes of the first two frames.
    #[test]
    fn test_extract_video_hashes() -> anyhow::Result<()> {
        const EXPECTED_FRAMES: usize = 126;
        const EXPECTED_FIRST: [u8; 8] = [60, 166, 75, 61, 48, 166, 73, 205];
        const EXPECTED_SECOND: [u8; 8] = [60, 166, 75, 61, 48, 166, 73, 205];

        let fixture = File::open("tests/video.webm")?;
        let frame_hashes = extract_video_hashes(&fixture)?;

        assert_eq!(
            frame_hashes.len(),
            EXPECTED_FRAMES,
            "Video did not have expected number of hashes"
        );
        assert_eq!(
            frame_hashes[0], EXPECTED_FIRST,
            "First frame had different hash"
        );
        assert_eq!(
            frame_hashes[1], EXPECTED_SECOND,
            "Second frame had different hash"
        );

        Ok(())
    }
}

BIN
tests/fox.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

BIN
tests/video.webm Normal file

Binary file not shown.