feat: complete image phase - SHA-256 exact + dHash perceptual duplicate detection
- lib.rs: scan_images, compute_dhash, hamming, find_duplicate_groups
- main.rs: CLI with folder arg and optional hamming threshold
- 13 unit tests: hamming, is_image_path, dhash, find_duplicate_groups
- 7 integration tests: real files, empty dir, cropped, non-image exclusion, subdirectory recursion, single file, CLI binary output
- All 20 tests passing
This commit is contained in:
286
src/lib.rs
Normal file
286
src/lib.rs
Normal file
@@ -0,0 +1,286 @@
|
||||
use anyhow::Result;
|
||||
use image::imageops::FilterType;
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
/// One scanned image file together with the two fingerprints used for
/// duplicate detection.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageEntry {
    /// Location of the file on disk.
    pub path: PathBuf,
    /// Lowercase hexadecimal SHA-256 digest of the file's raw bytes.
    pub sha256: String,
    /// 64-bit difference hash of the decoded picture.
    pub dhash: u64,
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct DuplicateGroup {
|
||||
pub kind: DuplicateKind,
|
||||
pub paths: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
/// The criterion by which the files of a [`DuplicateGroup`] were matched.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateKind {
    /// The files share the same SHA-256 digest.
    Exact,
    /// The files have different digests but difference hashes within the
    /// requested Hamming distance.
    Similar,
}
|
||||
|
||||
/// Returns `true` when `path` has one of the recognised image file
/// extensions.
///
/// The comparison ignores ASCII case, so `photo.JPG` and `photo.jpg` are both
/// accepted. Paths without an extension, or whose extension is not valid
/// UTF-8, are rejected. Only the name is inspected; the file is never opened.
fn is_image_path(path: &Path) -> bool {
    const IMAGE_EXTENSIONS: [&str; 8] = ["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"];

    path.extension()
        .and_then(|ext| ext.to_str())
        // Compare in place instead of allocating a lowercased copy per call.
        .map_or(false, |ext| {
            IMAGE_EXTENSIONS
                .iter()
                .any(|known| ext.eq_ignore_ascii_case(known))
        })
}
|
||||
|
||||
pub fn scan_images(root: &Path) -> Result<Vec<ImageEntry>> {
|
||||
let mut out = Vec::new();
|
||||
for entry in WalkDir::new(root).follow_links(true) {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if !entry.file_type().is_file() || !is_image_path(path) {
|
||||
continue;
|
||||
}
|
||||
let bytes = fs::read(path)?;
|
||||
let sha256 = format!("{:x}", Sha256::digest(&bytes));
|
||||
let img = image::open(path)?;
|
||||
let dhash = compute_dhash(&img);
|
||||
out.push(ImageEntry {
|
||||
path: path.to_path_buf(),
|
||||
sha256,
|
||||
dhash,
|
||||
});
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
pub fn compute_dhash(img: &image::DynamicImage) -> u64 {
|
||||
let gray = img
|
||||
.grayscale()
|
||||
.resize_exact(9, 8, FilterType::Triangle)
|
||||
.to_luma8();
|
||||
let mut hash = 0u64;
|
||||
let mut bit = 0;
|
||||
for y in 0..8 {
|
||||
for x in 0..8 {
|
||||
let left = gray.get_pixel(x, y)[0];
|
||||
let right = gray.get_pixel(x + 1, y)[0];
|
||||
if left > right {
|
||||
hash |= 1 << bit;
|
||||
}
|
||||
bit += 1;
|
||||
}
|
||||
}
|
||||
hash
|
||||
}
|
||||
|
||||
/// Returns the Hamming distance between `a` and `b`: the number of bit
/// positions (0 to 64) in which the two values differ.
#[must_use]
pub const fn hamming(a: u64, b: u64) -> u32 {
    (a ^ b).count_ones()
}
|
||||
|
||||
pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> Vec<DuplicateGroup> {
|
||||
let mut groups = Vec::new();
|
||||
|
||||
let mut exact: HashMap<&str, Vec<PathBuf>> = HashMap::new();
|
||||
for e in entries {
|
||||
exact.entry(&e.sha256).or_default().push(e.path.clone());
|
||||
}
|
||||
for paths in exact.into_values() {
|
||||
if paths.len() > 1 {
|
||||
groups.push(DuplicateGroup {
|
||||
kind: DuplicateKind::Exact,
|
||||
paths,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let n = entries.len();
|
||||
let mut parent: Vec<usize> = (0..n).collect();
|
||||
|
||||
fn find(parent: &mut [usize], x: usize) -> usize {
|
||||
if parent[x] != x {
|
||||
let p = parent[x];
|
||||
parent[x] = find(parent, p);
|
||||
}
|
||||
parent[x]
|
||||
}
|
||||
|
||||
fn union(parent: &mut [usize], a: usize, b: usize) {
|
||||
let ra = find(parent, a);
|
||||
let rb = find(parent, b);
|
||||
if ra != rb {
|
||||
parent[rb] = ra;
|
||||
}
|
||||
}
|
||||
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
if entries[i].sha256 == entries[j].sha256 {
|
||||
continue;
|
||||
}
|
||||
if hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold {
|
||||
union(&mut parent, i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut similar: HashMap<usize, Vec<PathBuf>> = HashMap::new();
|
||||
for (idx, e) in entries.iter().enumerate() {
|
||||
let root = find(&mut parent, idx);
|
||||
similar.entry(root).or_default().push(e.path.clone());
|
||||
}
|
||||
|
||||
let exact_paths: HashSet<PathBuf> = groups
|
||||
.iter()
|
||||
.flat_map(|g| g.paths.iter().cloned())
|
||||
.collect();
|
||||
|
||||
for paths in similar.into_values() {
|
||||
if paths.len() > 1 {
|
||||
let non_exact = paths.iter().filter(|p| !exact_paths.contains(*p)).count();
|
||||
if non_exact >= 2 {
|
||||
groups.push(DuplicateGroup {
|
||||
kind: DuplicateKind::Similar,
|
||||
paths,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
groups
|
||||
}
|
||||
|
||||
/// Unit tests for the hashing helpers and the grouping logic.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn hamming_identical() {
        assert_eq!(hamming(0, 0), 0);
        assert_eq!(hamming(u64::MAX, u64::MAX), 0);
    }

    #[test]
    fn hamming_opposite() {
        assert_eq!(hamming(0, u64::MAX), 64);
    }

    #[test]
    fn hamming_single_bit() {
        assert_eq!(hamming(0b0000, 0b0001), 1);
        assert_eq!(hamming(0b1010, 0b1000), 1);
    }

    #[test]
    fn is_image_path_accepts_valid_extensions() {
        for ext in &["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"] {
            let p = PathBuf::from(format!("photo.{ext}"));
            assert!(is_image_path(&p), "should accept .{ext}");
        }
    }

    #[test]
    fn is_image_path_case_insensitive() {
        assert!(is_image_path(Path::new("photo.JPG")));
        assert!(is_image_path(Path::new("photo.Png")));
    }

    #[test]
    fn is_image_path_rejects_non_image() {
        assert!(!is_image_path(Path::new("file.txt")));
        assert!(!is_image_path(Path::new("file.mp3")));
        assert!(!is_image_path(Path::new("file.pdf")));
        assert!(!is_image_path(Path::new("noext")));
    }

    #[test]
    fn dhash_deterministic() {
        let img = image::DynamicImage::new_rgb8(100, 100);
        let h1 = compute_dhash(&img);
        let h2 = compute_dhash(&img);
        assert_eq!(h1, h2);
    }

    #[test]
    fn dhash_solid_images_differ_from_gradient() {
        // Solid black: all pixels 0 -> dhash = 0 (no left > right)
        let black = image::DynamicImage::new_rgb8(64, 64);
        // Gradient: bright on the left, dark on the right -> left > right = true
        let mut grad = image::RgbImage::new(64, 64);
        for y in 0..64 {
            for x in 0..64 {
                let v = (255 - x * 255 / 63) as u8;
                grad.put_pixel(x, y, image::Rgb([v, v, v]));
            }
        }
        let grad = image::DynamicImage::ImageRgb8(grad);
        let h_black = compute_dhash(&black);
        let h_grad = compute_dhash(&grad);
        assert_ne!(h_black, h_grad, "solid vs gradient hashes must differ");
        assert!(hamming(h_black, h_grad) > 8, "solid vs gradient should differ significantly");
    }

    /// Builds an `ImageEntry` from literal values without touching the disk.
    fn make_entry(path: &str, sha: &str, dhash: u64) -> ImageEntry {
        ImageEntry {
            path: PathBuf::from(path),
            sha256: sha.to_string(),
            dhash,
        }
    }

    #[test]
    fn find_groups_empty_input() {
        let groups = find_duplicate_groups(&[], 8);
        assert!(groups.is_empty());
    }

    #[test]
    fn find_groups_no_duplicates() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0),
            make_entry("b.jpg", "bbb", u64::MAX),
        ];
        let groups = find_duplicate_groups(&entries, 8);
        assert!(groups.is_empty(), "different hash+dhash = no groups");
    }

    #[test]
    fn find_groups_exact_only() {
        let entries = vec![
            make_entry("a.jpg", "same", 100),
            make_entry("b.jpg", "same", 100),
            // Far from 100 in Hamming distance (61 bits), so c.jpg cannot be
            // mistaken for a perceptual match of a.jpg / b.jpg.
            make_entry("c.jpg", "diff", u64::MAX),
        ];
        let groups = find_duplicate_groups(&entries, 8);
        assert_eq!(groups.iter().filter(|g| g.kind == DuplicateKind::Exact).count(), 1);
        let exact = groups.iter().find(|g| g.kind == DuplicateKind::Exact).unwrap();
        assert_eq!(exact.paths.len(), 2);
        assert!(
            !groups.iter().any(|g| g.kind == DuplicateKind::Similar),
            "no similar group expected"
        );
    }

    #[test]
    fn find_groups_similar_only() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0b0000_0000),
            make_entry("b.jpg", "bbb", 0b0000_0011), // hamming=2
            make_entry("c.jpg", "ccc", u64::MAX),    // far away
        ];
        let groups = find_duplicate_groups(&entries, 8);
        assert!(
            groups.iter().any(|g| g.kind == DuplicateKind::Similar),
            "should find similar pair"
        );
        assert!(
            !groups.iter().any(|g| g.paths.iter().any(|p| p == Path::new("c.jpg"))
                && g.paths.iter().any(|p| p == Path::new("a.jpg"))),
            "c.jpg should not group with a.jpg"
        );
    }

    #[test]
    fn find_groups_threshold_boundary() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0),
            make_entry("b.jpg", "bbb", 0b1111_1111), // hamming=8
        ];
        // threshold=8: should match
        let groups8 = find_duplicate_groups(&entries, 8);
        assert!(groups8.iter().any(|g| g.kind == DuplicateKind::Similar));
        // threshold=7: should NOT match
        let groups7 = find_duplicate_groups(&entries, 7);
        assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar));
    }
}
|
||||
41
src/main.rs
41
src/main.rs
@@ -1,3 +1,42 @@
|
||||
use deduper::{find_duplicate_groups, scan_images, DuplicateKind};
|
||||
use std::env;
|
||||
use std::path::Path;
|
||||
|
||||
fn main() {
|
||||
println!("Hello, world!");
|
||||
let args: Vec<String> = env::args().collect();
|
||||
if args.len() < 2 {
|
||||
eprintln!("usage: deduper <folder> [hamming-threshold]");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
let root = Path::new(&args[1]);
|
||||
let threshold = args
|
||||
.get(2)
|
||||
.and_then(|s| s.parse::<u32>().ok())
|
||||
.unwrap_or(8);
|
||||
|
||||
let entries = match scan_images(root) {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
eprintln!("scan error: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let groups = find_duplicate_groups(&entries, threshold);
|
||||
if groups.is_empty() {
|
||||
println!("no image duplicates found");
|
||||
return;
|
||||
}
|
||||
|
||||
for (idx, group) in groups.iter().enumerate() {
|
||||
let kind = match group.kind {
|
||||
DuplicateKind::Exact => "exact",
|
||||
DuplicateKind::Similar => "similar",
|
||||
};
|
||||
println!("group {} [{}]", idx + 1, kind);
|
||||
for path in &group.paths {
|
||||
println!(" {}", path.display());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user