feat: complete image phase - SHA-256 exact + dHash perceptual duplicate detection
- lib.rs: scan_images, compute_dhash, hamming, find_duplicate_groups
- main.rs: CLI with folder arg and optional hamming threshold
- 13 unit tests: hamming, is_image_path, dhash, find_duplicate_groups
- 7 integration tests: real files, empty dir, cropped, non-image exclusion, subdirectory recursion, single file, CLI binary output
- All 20 tests passing
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1 +1,2 @@
|
||||
/target
|
||||
.ssh/
|
||||
|
||||
1101
Cargo.lock
generated
Normal file
1101
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
@@ -4,3 +4,7 @@ version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
image = { version = "0.25", default-features = true, features = ["jpeg", "png", "gif", "webp", "bmp", "tiff"] }
|
||||
sha2 = "0.10"
|
||||
walkdir = "2.5"
|
||||
anyhow = "1"
|
||||
|
||||
286
src/lib.rs
Normal file
286
src/lib.rs
Normal file
@@ -0,0 +1,286 @@
|
||||
use anyhow::Result;
|
||||
use image::imageops::FilterType;
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
/// One scanned image file together with the two fingerprints computed for it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageEntry {
    /// Location of the image file.
    pub path: PathBuf,
    /// SHA-256 digest of the file contents as a hex string; equal digests
    /// are treated as exact duplicates.
    pub sha256: String,
    /// 64-bit difference hash of the decoded picture, compared by Hamming
    /// distance to detect near-duplicates.
    pub dhash: u64,
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct DuplicateGroup {
|
||||
pub kind: DuplicateKind,
|
||||
pub paths: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
/// How the members of a [`DuplicateGroup`] relate to each other.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateKind {
    /// Members share the same SHA-256 digest.
    Exact,
    /// Members have difference hashes within the Hamming threshold.
    Similar,
}
|
||||
|
||||
/// Returns `true` when `path` carries one of the supported image extensions.
///
/// The comparison ignores ASCII case. Paths without an extension, or with an
/// extension that is not valid UTF-8, are rejected.
fn is_image_path(path: &Path) -> bool {
    const IMAGE_EXTENSIONS: [&str; 8] =
        ["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => IMAGE_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
|
||||
|
||||
pub fn scan_images(root: &Path) -> Result<Vec<ImageEntry>> {
|
||||
let mut out = Vec::new();
|
||||
for entry in WalkDir::new(root).follow_links(true) {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if !entry.file_type().is_file() || !is_image_path(path) {
|
||||
continue;
|
||||
}
|
||||
let bytes = fs::read(path)?;
|
||||
let sha256 = format!("{:x}", Sha256::digest(&bytes));
|
||||
let img = image::open(path)?;
|
||||
let dhash = compute_dhash(&img);
|
||||
out.push(ImageEntry {
|
||||
path: path.to_path_buf(),
|
||||
sha256,
|
||||
dhash,
|
||||
});
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
pub fn compute_dhash(img: &image::DynamicImage) -> u64 {
|
||||
let gray = img
|
||||
.grayscale()
|
||||
.resize_exact(9, 8, FilterType::Triangle)
|
||||
.to_luma8();
|
||||
let mut hash = 0u64;
|
||||
let mut bit = 0;
|
||||
for y in 0..8 {
|
||||
for x in 0..8 {
|
||||
let left = gray.get_pixel(x, y)[0];
|
||||
let right = gray.get_pixel(x + 1, y)[0];
|
||||
if left > right {
|
||||
hash |= 1 << bit;
|
||||
}
|
||||
bit += 1;
|
||||
}
|
||||
}
|
||||
hash
|
||||
}
|
||||
|
||||
/// Returns the Hamming distance between two 64-bit hashes, i.e. the number
/// of bit positions in which `a` and `b` differ (0 to 64).
pub fn hamming(a: u64, b: u64) -> u32 {
    let differing_bits = a ^ b;
    differing_bits.count_ones()
}
|
||||
|
||||
pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> Vec<DuplicateGroup> {
|
||||
let mut groups = Vec::new();
|
||||
|
||||
let mut exact: HashMap<&str, Vec<PathBuf>> = HashMap::new();
|
||||
for e in entries {
|
||||
exact.entry(&e.sha256).or_default().push(e.path.clone());
|
||||
}
|
||||
for paths in exact.into_values() {
|
||||
if paths.len() > 1 {
|
||||
groups.push(DuplicateGroup {
|
||||
kind: DuplicateKind::Exact,
|
||||
paths,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let n = entries.len();
|
||||
let mut parent: Vec<usize> = (0..n).collect();
|
||||
|
||||
fn find(parent: &mut [usize], x: usize) -> usize {
|
||||
if parent[x] != x {
|
||||
let p = parent[x];
|
||||
parent[x] = find(parent, p);
|
||||
}
|
||||
parent[x]
|
||||
}
|
||||
|
||||
fn union(parent: &mut [usize], a: usize, b: usize) {
|
||||
let ra = find(parent, a);
|
||||
let rb = find(parent, b);
|
||||
if ra != rb {
|
||||
parent[rb] = ra;
|
||||
}
|
||||
}
|
||||
|
||||
for i in 0..n {
|
||||
for j in (i + 1)..n {
|
||||
if entries[i].sha256 == entries[j].sha256 {
|
||||
continue;
|
||||
}
|
||||
if hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold {
|
||||
union(&mut parent, i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut similar: HashMap<usize, Vec<PathBuf>> = HashMap::new();
|
||||
for (idx, e) in entries.iter().enumerate() {
|
||||
let root = find(&mut parent, idx);
|
||||
similar.entry(root).or_default().push(e.path.clone());
|
||||
}
|
||||
|
||||
let exact_paths: HashSet<PathBuf> = groups
|
||||
.iter()
|
||||
.flat_map(|g| g.paths.iter().cloned())
|
||||
.collect();
|
||||
|
||||
for paths in similar.into_values() {
|
||||
if paths.len() > 1 {
|
||||
let non_exact = paths.iter().filter(|p| !exact_paths.contains(*p)).count();
|
||||
if non_exact >= 2 {
|
||||
groups.push(DuplicateGroup {
|
||||
kind: DuplicateKind::Similar,
|
||||
paths,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
groups
|
||||
}
|
||||
|
||||
/// Unit tests for the hashing helpers and the grouping logic.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn hamming_identical() {
        assert_eq!(hamming(0, 0), 0);
        assert_eq!(hamming(u64::MAX, u64::MAX), 0);
    }

    #[test]
    fn hamming_opposite() {
        assert_eq!(hamming(0, u64::MAX), 64);
    }

    #[test]
    fn hamming_single_bit() {
        assert_eq!(hamming(0b0000, 0b0001), 1);
        assert_eq!(hamming(0b1010, 0b1000), 1);
    }

    #[test]
    fn is_image_path_accepts_valid_extensions() {
        for ext in &["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"] {
            let p = PathBuf::from(format!("photo.{ext}"));
            assert!(is_image_path(&p), "should accept .{ext}");
        }
    }

    #[test]
    fn is_image_path_case_insensitive() {
        assert!(is_image_path(Path::new("photo.JPG")));
        assert!(is_image_path(Path::new("photo.Png")));
    }

    #[test]
    fn is_image_path_rejects_non_image() {
        assert!(!is_image_path(Path::new("file.txt")));
        assert!(!is_image_path(Path::new("file.mp3")));
        assert!(!is_image_path(Path::new("file.pdf")));
        assert!(!is_image_path(Path::new("noext")));
    }

    #[test]
    fn dhash_deterministic() {
        let img = image::DynamicImage::new_rgb8(100, 100);
        let h1 = compute_dhash(&img);
        let h2 = compute_dhash(&img);
        assert_eq!(h1, h2);
    }

    #[test]
    fn dhash_solid_images_differ_from_gradient() {
        // Solid black: all pixels 0 -> dhash = 0 (no left > right)
        let black = image::DynamicImage::new_rgb8(64, 64);
        // Gradient: bright on the left, dark on the right -> left > right holds
        let mut grad = image::RgbImage::new(64, 64);
        for y in 0..64 {
            for x in 0..64 {
                let v = (255 - x * 255 / 63) as u8;
                grad.put_pixel(x, y, image::Rgb([v, v, v]));
            }
        }
        let grad = image::DynamicImage::ImageRgb8(grad);
        let h_black = compute_dhash(&black);
        let h_grad = compute_dhash(&grad);
        assert_ne!(h_black, h_grad, "solid vs gradient hashes must differ");
        assert!(
            hamming(h_black, h_grad) > 8,
            "solid vs gradient should differ significantly"
        );
    }

    /// Builds an entry with hand-picked fingerprints for the grouping tests.
    fn make_entry(path: &str, sha: &str, dhash: u64) -> ImageEntry {
        ImageEntry {
            path: PathBuf::from(path),
            sha256: sha.to_string(),
            dhash,
        }
    }

    #[test]
    fn find_groups_empty_input() {
        let groups = find_duplicate_groups(&[], 8);
        assert!(groups.is_empty());
    }

    #[test]
    fn find_groups_no_duplicates() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0),
            make_entry("b.jpg", "bbb", u64::MAX),
        ];
        let groups = find_duplicate_groups(&entries, 8);
        assert!(groups.is_empty(), "different hash+dhash = no groups");
    }

    #[test]
    fn find_groups_exact_only() {
        // c.jpg must be perceptually far from a/b. The previous fixture value
        // 999 is only 5 bits away from 100, i.e. inside the threshold of 8.
        let entries = vec![
            make_entry("a.jpg", "same", 100),
            make_entry("b.jpg", "same", 100),
            make_entry("c.jpg", "diff", u64::MAX),
        ];
        assert!(hamming(100, u64::MAX) > 8, "fixture must be outside the threshold");

        let groups = find_duplicate_groups(&entries, 8);
        assert_eq!(
            groups.iter().filter(|g| g.kind == DuplicateKind::Exact).count(),
            1
        );
        let exact = groups
            .iter()
            .find(|g| g.kind == DuplicateKind::Exact)
            .unwrap();
        assert_eq!(exact.paths.len(), 2);
        assert!(
            !groups.iter().any(|g| g.kind == DuplicateKind::Similar),
            "byte-identical copies alone must not form a similar group"
        );
    }

    #[test]
    fn find_groups_similar_only() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0b0000_0000),
            make_entry("b.jpg", "bbb", 0b0000_0011), // hamming=2
            make_entry("c.jpg", "ccc", u64::MAX),    // far away
        ];
        let groups = find_duplicate_groups(&entries, 8);
        assert!(
            groups.iter().any(|g| g.kind == DuplicateKind::Similar),
            "should find similar pair"
        );
        assert!(
            !groups.iter().any(|g| {
                g.paths.iter().any(|p| p == Path::new("c.jpg"))
                    && g.paths.iter().any(|p| p == Path::new("a.jpg"))
            }),
            "c.jpg should not group with a.jpg"
        );
    }

    #[test]
    fn find_groups_threshold_boundary() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0),
            make_entry("b.jpg", "bbb", 0b1111_1111), // hamming=8
        ];
        // threshold=8: should match
        let groups8 = find_duplicate_groups(&entries, 8);
        assert!(groups8.iter().any(|g| g.kind == DuplicateKind::Similar));
        // threshold=7: should NOT match
        let groups7 = find_duplicate_groups(&entries, 7);
        assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar));
    }
}
|
||||
41
src/main.rs
41
src/main.rs
@@ -1,3 +1,42 @@
|
||||
use deduper::{find_duplicate_groups, scan_images, DuplicateKind};
|
||||
use std::env;
|
||||
use std::path::Path;
|
||||
|
||||
fn main() {
|
||||
println!("Hello, world!");
|
||||
let args: Vec<String> = env::args().collect();
|
||||
if args.len() < 2 {
|
||||
eprintln!("usage: deduper <folder> [hamming-threshold]");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
let root = Path::new(&args[1]);
|
||||
let threshold = args
|
||||
.get(2)
|
||||
.and_then(|s| s.parse::<u32>().ok())
|
||||
.unwrap_or(8);
|
||||
|
||||
let entries = match scan_images(root) {
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
eprintln!("scan error: {e}");
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
let groups = find_duplicate_groups(&entries, threshold);
|
||||
if groups.is_empty() {
|
||||
println!("no image duplicates found");
|
||||
return;
|
||||
}
|
||||
|
||||
for (idx, group) in groups.iter().enumerate() {
|
||||
let kind = match group.kind {
|
||||
DuplicateKind::Exact => "exact",
|
||||
DuplicateKind::Similar => "similar",
|
||||
};
|
||||
println!("group {} [{}]", idx + 1, kind);
|
||||
for path in &group.paths {
|
||||
println!(" {}", path.display());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
124
tests/image_phase.rs
Normal file
124
tests/image_phase.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
use deduper::{find_duplicate_groups, hamming, scan_images, DuplicateKind};
|
||||
use std::path::Path;
|
||||
|
||||
/// Returns the absolute path of the test image called `name` inside the
/// fixture directory.
fn fixture(name: &str) -> String {
    let mut full = String::from("/a0/usr/projects/deduper/.a0proj/test_media/images/");
    full.push_str(name);
    full
}
|
||||
|
||||
/// End-to-end check on the real fixtures: the byte copy is an exact
/// duplicate, the resized copy is perceptually similar, and the solid blue
/// image is unrelated.
#[test]
fn image_phase_real_files_red_green() {
    let entries = scan_images(Path::new(
        "/a0/usr/projects/deduper/.a0proj/test_media/images",
    ))
    .expect("scan images");
    assert!(entries.len() >= 5, "need fixtures");

    // Looks up a scanned entry by its full fixture path.
    let by_name = |name: &str| {
        let wanted = fixture(name);
        entries
            .iter()
            .find(|e| e.path == Path::new(&wanted))
            .unwrap()
    };
    let orig = by_name("orig.jpg");
    let copy = by_name("orig_copy.jpg");
    let resized = by_name("orig_resized.jpg");
    let blue = by_name("solid_blue.jpg");

    assert_eq!(orig.sha256, copy.sha256);
    assert!(hamming(orig.dhash, resized.dhash) <= 8, "resized should be similar");
    assert!(hamming(orig.dhash, blue.dhash) > 8, "blue should be unrelated");

    let groups = find_duplicate_groups(&entries, 8);
    // True when any path in the list ends with the given file name.
    let listed =
        |paths: &[std::path::PathBuf], file: &str| paths.iter().any(|p| p.ends_with(file));

    let exact_found = groups.iter().any(|g| {
        g.kind == DuplicateKind::Exact
            && listed(&g.paths, "orig.jpg")
            && listed(&g.paths, "orig_copy.jpg")
    });
    assert!(exact_found, "missing exact group");

    let similar_found = groups.iter().any(|g| {
        g.kind == DuplicateKind::Similar
            && listed(&g.paths, "orig.jpg")
            && listed(&g.paths, "orig_resized.jpg")
    });
    assert!(similar_found, "missing similar group");

    let blue_with_orig = groups
        .iter()
        .any(|g| listed(&g.paths, "solid_blue.jpg") && listed(&g.paths, "orig.jpg"));
    assert!(!blue_with_orig, "false positive with unrelated image");
}
|
||||
|
||||
/// Scanning a directory without any files succeeds and yields nothing.
#[test]
fn scan_empty_dir_returns_no_entries() {
    let empty_dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images/empty_dir");
    let found = scan_images(empty_dir).expect("scan empty dir");
    assert!(found.is_empty(), "empty dir should yield no entries");
}
|
||||
|
||||
/// A cropped copy differs byte-wise from the original but stays within a
/// dHash Hamming distance of 12.
#[test]
fn cropped_image_similar_to_original() {
    let entries = scan_images(Path::new(
        "/a0/usr/projects/deduper/.a0proj/test_media/images",
    ))
    .expect("scan");

    let orig = entries
        .iter()
        .find(|e| e.path.ends_with("orig.jpg"))
        .expect("orig.jpg");
    let cropped = entries
        .iter()
        .find(|e| e.path.ends_with("orig_cropped.jpg"))
        .expect("orig_cropped.jpg");

    assert_ne!(orig.sha256, cropped.sha256, "cropped should differ in bytes");
    let distance = hamming(orig.dhash, cropped.dhash);
    assert!(distance <= 12, "cropped should be perceptually similar (hamming <= 12)");
}
|
||||
|
||||
/// Files without an image extension never show up in scan results.
#[test]
fn non_image_files_excluded_from_scan() {
    let entries = scan_images(Path::new(
        "/a0/usr/projects/deduper/.a0proj/test_media/images",
    ))
    .expect("scan");

    // readme.txt and data.csv are expected to sit in the fixture directory;
    // neither may appear among the scanned entries.
    let scanned = |file: &str| entries.iter().any(|e| e.path.ends_with(file));
    assert!(!scanned("readme.txt"), "txt file should be excluded");
    assert!(!scanned("data.csv"), "csv file should be excluded");

    // Every entry that was returned must carry a supported image extension.
    for e in &entries {
        let ext = e.path.extension().unwrap().to_str().unwrap().to_ascii_lowercase();
        let known = ["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"]
            .contains(&ext.as_str());
        assert!(known, "unexpected ext: {ext} in {}", e.path.display());
    }
}
|
||||
|
||||
/// Images inside nested directories are scanned and take part in grouping.
#[test]
fn scan_recurses_into_subdirectories() {
    let entries = scan_images(Path::new(
        "/a0/usr/projects/deduper/.a0proj/test_media/images",
    ))
    .expect("scan");

    // True for the top-level gradient.jpg only, not for anything below subdir/.
    let is_top_level_gradient = |p: &Path| {
        p.ends_with("gradient.jpg") && !p.to_str().unwrap().contains("subdir")
    };

    // gradient_sub.jpg lives in subdir/ and must be discovered.
    let sub = entries
        .iter()
        .find(|e| e.path.ends_with("subdir/gradient_sub.jpg"));
    assert!(sub.is_some(), "should find image in subdirectory");

    // It is expected to be a byte-for-byte copy of gradient.jpg.
    let grad = entries
        .iter()
        .find(|e| is_top_level_gradient(&e.path))
        .unwrap();
    let sub = sub.unwrap();
    assert_eq!(grad.sha256, sub.sha256, "exact copy should have same sha256");

    let groups = find_duplicate_groups(&entries, 8);
    let grouped_as_exact = groups.iter().any(|g| {
        g.kind == DuplicateKind::Exact
            && g.paths.iter().any(|p| is_top_level_gradient(p))
            && g.paths.iter().any(|p| p.ends_with("gradient_sub.jpg"))
    });
    assert!(
        grouped_as_exact,
        "should group gradient.jpg and subdir/gradient_sub.jpg as exact"
    );
}
|
||||
|
||||
/// A directory holding exactly one image produces one entry and no groups.
#[test]
fn single_image_no_duplicates() {
    let single_dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images/single");
    let entries = scan_images(single_dir).expect("scan");
    assert_eq!(entries.len(), 1, "should find exactly one image");

    let no_groups = find_duplicate_groups(&entries, 8).is_empty();
    assert!(no_groups, "single image should produce no duplicate groups");
}
|
||||
|
||||
/// Runs the compiled `deduper` binary against the fixture directory and
/// checks that it exits successfully and prints both group kinds.
#[test]
fn cli_binary_reports_duplicates() {
    let mut command = std::process::Command::new(env!("CARGO_BIN_EXE_deduper"));
    command
        .arg("/a0/usr/projects/deduper/.a0proj/test_media/images")
        .arg("8");
    let output = command.output().expect("failed to run deduper binary");

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(output.status.success(), "binary should exit 0");
    for marker in ["[exact]", "[similar]"] {
        let label = &marker[1..marker.len() - 1];
        assert!(
            stdout.contains(marker),
            "output should contain {label} groups: {stdout}"
        );
    }
}
|
||||
Reference in New Issue
Block a user