feat: complete image phase - SHA-256 exact + dHash perceptual duplicate detection

- lib.rs: scan_images, compute_dhash, hamming, find_duplicate_groups
- main.rs: CLI with folder arg and optional hamming threshold
- 13 unit tests: hamming, is_image_path, dhash, find_duplicate_groups
- 7 integration tests: real files, empty dir, cropped, non-image exclusion,
  subdirectory recursion, single file, CLI binary output
- All 20 tests passing
This commit is contained in:
admin
2026-04-27 23:33:27 +00:00
parent 71c8df2de5
commit e1f8201b5c
6 changed files with 1556 additions and 1 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
/target
.ssh/

1101
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4,3 +4,7 @@ version = "0.1.0"
edition = "2024"
[dependencies]
image = { version = "0.25", default-features = true, features = ["jpeg", "png", "gif", "webp", "bmp", "tiff"] }
sha2 = "0.10"
walkdir = "2.5"
anyhow = "1"

286
src/lib.rs Normal file
View File

@@ -0,0 +1,286 @@
use anyhow::Result;
use image::imageops::FilterType;
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
use walkdir::WalkDir;
/// One image file together with the two fingerprints used for duplicate detection.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageEntry {
/// Location of the image file.
pub path: PathBuf,
/// Content hash of the file; `scan_images` stores the lowercase hex SHA-256 of the raw bytes.
pub sha256: String,
/// 64-bit perceptual difference hash; `scan_images` fills this from `compute_dhash`.
pub dhash: u64,
}
/// A set of image paths that `find_duplicate_groups` considers duplicates of one another.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DuplicateGroup {
/// Whether the members match byte-for-byte or only perceptually.
pub kind: DuplicateKind,
/// Paths of the images belonging to this group.
pub paths: Vec<PathBuf>,
}
/// How the members of a `DuplicateGroup` were matched.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DuplicateKind {
/// Members share the same SHA-256 digest.
Exact,
/// Members were linked through dHash values within the Hamming threshold.
Similar,
}
/// Reports whether `path` ends in one of the recognised image extensions.
///
/// The comparison ignores ASCII case. Paths without an extension, or whose
/// extension is not valid UTF-8, are rejected.
fn is_image_path(path: &Path) -> bool {
    const IMAGE_EXTENSIONS: [&str; 8] =
        ["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"];

    let Some(ext) = path.extension().and_then(|raw| raw.to_str()) else {
        return false;
    };
    IMAGE_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
}
pub fn scan_images(root: &Path) -> Result<Vec<ImageEntry>> {
let mut out = Vec::new();
for entry in WalkDir::new(root).follow_links(true) {
let entry = entry?;
let path = entry.path();
if !entry.file_type().is_file() || !is_image_path(path) {
continue;
}
let bytes = fs::read(path)?;
let sha256 = format!("{:x}", Sha256::digest(&bytes));
let img = image::open(path)?;
let dhash = compute_dhash(&img);
out.push(ImageEntry {
path: path.to_path_buf(),
sha256,
dhash,
});
}
Ok(out)
}
/// Computes a 64-bit difference hash (dHash) of `img`.
///
/// The image is converted to grayscale and squeezed to 9x8 pixels. Every
/// pixel in the left eight columns is compared with its right-hand neighbour;
/// bit `row * 8 + col` of the result is set when the left pixel is brighter.
pub fn compute_dhash(img: &image::DynamicImage) -> u64 {
    let thumb = img
        .grayscale()
        .resize_exact(9, 8, FilterType::Triangle)
        .to_luma8();

    (0..8u32)
        .flat_map(|row| (0..8u32).map(move |col| (row, col)))
        .filter(|&(row, col)| thumb.get_pixel(col, row)[0] > thumb.get_pixel(col + 1, row)[0])
        .fold(0u64, |acc, (row, col)| acc | (1u64 << (row * 8 + col)))
}
/// Returns the Hamming distance between `a` and `b`: the number of bit
/// positions (0 through 64) in which the two values differ.
pub fn hamming(a: u64, b: u64) -> u32 {
    let differing_bits = a ^ b;
    differing_bits.count_ones()
}
/// Groups `entries` into exact and perceptually similar duplicate sets.
///
/// * **Exact** groups contain every entry sharing one SHA-256 digest, for
///   digests that occur more than once.
/// * **Similar** groups are the connected components obtained by linking any
///   two entries with *different* digests whose dHash values are at most
///   `hamming_threshold` bits apart. Linking is transitive. A component is
///   reported whenever it spans at least two distinct digests, so an image
///   that resembles a set of exact copies is reported together with them.
///
/// Output is deterministic: exact groups come first, ordered by the first
/// appearance of their digest in `entries`; similar groups follow, ordered by
/// their earliest member. Paths inside a group keep the order of `entries`.
///
/// Every pair of entries is compared, so the running time is quadratic in
/// `entries.len()`.
pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> Vec<DuplicateGroup> {
    /// Returns the representative of `x`'s set, shortening the path on the way.
    fn find(parent: &mut [usize], mut x: usize) -> usize {
        while parent[x] != x {
            parent[x] = parent[parent[x]];
            x = parent[x];
        }
        x
    }

    /// Merges the sets containing `a` and `b`; the smaller root index wins so
    /// that a set's representative is always its earliest member.
    fn union(parent: &mut [usize], a: usize, b: usize) {
        let ra = find(parent, a);
        let rb = find(parent, b);
        if ra != rb {
            let (keep, absorb) = if ra < rb { (ra, rb) } else { (rb, ra) };
            parent[absorb] = keep;
        }
    }

    let n = entries.len();
    let mut groups = Vec::new();

    // Exact duplicates: bucket by digest, remembering first-seen order so the
    // result does not depend on hash-map iteration order.
    let mut bucket_of: HashMap<&str, usize> = HashMap::new();
    let mut buckets: Vec<Vec<PathBuf>> = Vec::new();
    for e in entries {
        let slot = *bucket_of.entry(e.sha256.as_str()).or_insert_with(|| {
            buckets.push(Vec::new());
            buckets.len() - 1
        });
        buckets[slot].push(e.path.clone());
    }
    groups.extend(
        buckets
            .into_iter()
            .filter(|paths| paths.len() > 1)
            .map(|paths| DuplicateGroup {
                kind: DuplicateKind::Exact,
                paths,
            }),
    );

    // Similar images: link entries with different content but close dHashes.
    let mut parent: Vec<usize> = (0..n).collect();
    for i in 0..n {
        for j in (i + 1)..n {
            if entries[i].sha256 != entries[j].sha256
                && hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold
            {
                union(&mut parent, i, j);
            }
        }
    }

    // Collect components indexed by their root (= earliest member).
    let mut components: Vec<Vec<usize>> = vec![Vec::new(); n];
    for idx in 0..n {
        let root = find(&mut parent, idx);
        components[root].push(idx);
    }

    for members in components {
        // A component is a perceptual match only if it joins different contents.
        let distinct: HashSet<&str> = members
            .iter()
            .map(|&i| entries[i].sha256.as_str())
            .collect();
        if distinct.len() < 2 {
            continue;
        }
        groups.push(DuplicateGroup {
            kind: DuplicateKind::Similar,
            paths: members.iter().map(|&i| entries[i].path.clone()).collect(),
        });
    }

    groups
}
#[cfg(test)]
mod tests {
    // Unit tests for the pure helpers: Hamming distance, extension filtering,
    // dHash computation and duplicate grouping. No files are touched here.
    use super::*;
    use std::path::PathBuf;

    /// Equal values differ in zero bits.
    #[test]
    fn hamming_identical() {
        assert_eq!(hamming(0, 0), 0);
        assert_eq!(hamming(u64::MAX, u64::MAX), 0);
    }

    /// Complementary values differ in all 64 bits.
    #[test]
    fn hamming_opposite() {
        assert_eq!(hamming(0, u64::MAX), 64);
    }

    /// A single flipped bit yields distance one.
    #[test]
    fn hamming_single_bit() {
        assert_eq!(hamming(0b0000, 0b0001), 1);
        assert_eq!(hamming(0b1010, 0b1000), 1);
    }

    /// Every supported extension is recognised.
    #[test]
    fn is_image_path_accepts_valid_extensions() {
        for ext in &["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"] {
            let p = PathBuf::from(format!("photo.{ext}"));
            assert!(is_image_path(&p), "should accept .{ext}");
        }
    }

    /// Extension matching ignores ASCII case.
    #[test]
    fn is_image_path_case_insensitive() {
        assert!(is_image_path(Path::new("photo.JPG")));
        assert!(is_image_path(Path::new("photo.Png")));
    }

    /// Non-image extensions and extension-less names are rejected.
    #[test]
    fn is_image_path_rejects_non_image() {
        assert!(!is_image_path(Path::new("file.txt")));
        assert!(!is_image_path(Path::new("file.mp3")));
        assert!(!is_image_path(Path::new("file.pdf")));
        assert!(!is_image_path(Path::new("noext")));
    }

    /// Hashing the same image twice gives the same value.
    #[test]
    fn dhash_deterministic() {
        let img = image::DynamicImage::new_rgb8(100, 100);
        let h1 = compute_dhash(&img);
        let h2 = compute_dhash(&img);
        assert_eq!(h1, h2);
    }

    /// A flat image and a horizontal gradient must hash far apart.
    #[test]
    fn dhash_solid_images_differ_from_gradient() {
        // Solid black: all pixels 0 -> dhash = 0 (no left > right)
        let black = image::DynamicImage::new_rgb8(64, 64);
        // Gradient: bright on the left, dark on the right -> left > right holds
        let mut grad = image::RgbImage::new(64, 64);
        for y in 0..64 {
            for x in 0..64 {
                let v = (255 - x * 255 / 63) as u8;
                grad.put_pixel(x, y, image::Rgb([v, v, v]));
            }
        }
        let grad = image::DynamicImage::ImageRgb8(grad);
        let h_black = compute_dhash(&black);
        let h_grad = compute_dhash(&grad);
        assert_ne!(h_black, h_grad, "solid vs gradient hashes must differ");
        assert!(
            hamming(h_black, h_grad) > 8,
            "solid vs gradient should differ significantly"
        );
    }

    /// Builds an `ImageEntry` from literal values without touching the disk.
    fn make_entry(path: &str, sha: &str, dhash: u64) -> ImageEntry {
        ImageEntry {
            path: PathBuf::from(path),
            sha256: sha.to_string(),
            dhash,
        }
    }

    /// True when `group` lists `name` among its paths.
    fn has_path(group: &DuplicateGroup, name: &str) -> bool {
        group.paths.iter().any(|p| p == Path::new(name))
    }

    /// No entries, no groups.
    #[test]
    fn find_groups_empty_input() {
        let groups = find_duplicate_groups(&[], 8);
        assert!(groups.is_empty());
    }

    /// Entries differing in both digest and dHash are left ungrouped.
    #[test]
    fn find_groups_no_duplicates() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0),
            make_entry("b.jpg", "bbb", u64::MAX),
        ];
        let groups = find_duplicate_groups(&entries, 8);
        assert!(groups.is_empty(), "different hash+dhash = no groups");
    }

    /// Two byte-identical files form one exact group and nothing else.
    #[test]
    fn find_groups_exact_only() {
        let entries = vec![
            make_entry("a.jpg", "same", 100),
            make_entry("b.jpg", "same", 100),
            // Must be far from 100: e.g. 999 is only 5 bits away and would
            // count as perceptually similar under threshold 8.
            make_entry("c.jpg", "diff", u64::MAX),
        ];
        let groups = find_duplicate_groups(&entries, 8);
        assert_eq!(
            groups
                .iter()
                .filter(|g| g.kind == DuplicateKind::Exact)
                .count(),
            1
        );
        let exact = groups
            .iter()
            .find(|g| g.kind == DuplicateKind::Exact)
            .unwrap();
        assert_eq!(exact.paths.len(), 2);
        assert!(
            !groups.iter().any(|g| g.kind == DuplicateKind::Similar),
            "unrelated image must not create a similar group"
        );
    }

    /// Close dHashes with different digests form a similar group.
    #[test]
    fn find_groups_similar_only() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0b0000_0000),
            make_entry("b.jpg", "bbb", 0b0000_0011), // hamming=2
            make_entry("c.jpg", "ccc", u64::MAX),    // far away
        ];
        let groups = find_duplicate_groups(&entries, 8);
        assert!(
            groups.iter().any(|g| g.kind == DuplicateKind::Similar
                && has_path(g, "a.jpg")
                && has_path(g, "b.jpg")),
            "should find similar pair"
        );
        assert!(
            !groups
                .iter()
                .any(|g| has_path(g, "c.jpg") && has_path(g, "a.jpg")),
            "c.jpg should not group with a.jpg"
        );
    }

    /// The threshold is inclusive: distance 8 matches at 8 but not at 7.
    #[test]
    fn find_groups_threshold_boundary() {
        let entries = vec![
            make_entry("a.jpg", "aaa", 0),
            make_entry("b.jpg", "bbb", 0b1111_1111), // hamming=8
        ];
        // threshold=8: should match
        let groups8 = find_duplicate_groups(&entries, 8);
        assert!(groups8.iter().any(|g| g.kind == DuplicateKind::Similar));
        // threshold=7: should NOT match
        let groups7 = find_duplicate_groups(&entries, 7);
        assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar));
    }
}

View File

@@ -1,3 +1,42 @@
use deduper::{find_duplicate_groups, scan_images, DuplicateKind};
use std::env;
use std::path::Path;
/// Command-line entry point: `deduper <folder> [hamming-threshold]`.
///
/// Scans `<folder>` for images and prints each duplicate group to stdout.
/// The optional threshold defaults to 8. Usage errors, an unparsable
/// threshold and scan failures are reported on stderr with exit status 1.
fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        eprintln!("usage: deduper <folder> [hamming-threshold]");
        std::process::exit(1);
    }
    let root = Path::new(&args[1]);

    // Reject a malformed threshold instead of silently using the default,
    // which would hide typos such as `deduper dir 1O`.
    let threshold = match args.get(2) {
        None => 8,
        Some(raw) => match raw.parse::<u32>() {
            Ok(value) => value,
            Err(e) => {
                eprintln!("invalid hamming threshold {raw:?}: {e}");
                eprintln!("usage: deduper <folder> [hamming-threshold]");
                std::process::exit(1);
            }
        },
    };

    let entries = match scan_images(root) {
        Ok(v) => v,
        Err(e) => {
            eprintln!("scan error: {e}");
            std::process::exit(1);
        }
    };

    let groups = find_duplicate_groups(&entries, threshold);
    if groups.is_empty() {
        println!("no image duplicates found");
        return;
    }

    for (idx, group) in groups.iter().enumerate() {
        let kind = match group.kind {
            DuplicateKind::Exact => "exact",
            DuplicateKind::Similar => "similar",
        };
        println!("group {} [{}]", idx + 1, kind);
        for path in &group.paths {
            println!(" {}", path.display());
        }
    }
}

124
tests/image_phase.rs Normal file
View File

@@ -0,0 +1,124 @@
use deduper::{find_duplicate_groups, hamming, scan_images, DuplicateKind};
use std::path::Path;
/// Returns the absolute path of the fixture file `name` inside the image
/// test-media directory.
fn fixture(name: &str) -> String {
    let mut full = String::from("/a0/usr/projects/deduper/.a0proj/test_media/images/");
    full.push_str(name);
    full
}
/// End-to-end check on the real fixtures: a byte copy shares the digest, a
/// resized copy is perceptually close, a solid blue image is not, and the
/// grouping reflects all three facts.
#[test]
fn image_phase_real_files_red_green() {
    let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images");
    let entries = scan_images(dir).expect("scan images");
    assert!(entries.len() >= 5, "need fixtures");

    // Looks up a scanned entry by its fixture file name.
    let lookup = |name: &str| {
        let wanted = fixture(name);
        entries
            .iter()
            .find(|e| e.path == Path::new(&wanted))
            .unwrap()
    };
    let orig = lookup("orig.jpg");
    let copy = lookup("orig_copy.jpg");
    let resized = lookup("orig_resized.jpg");
    let blue = lookup("solid_blue.jpg");

    assert_eq!(orig.sha256, copy.sha256);
    assert!(hamming(orig.dhash, resized.dhash) <= 8, "resized should be similar");
    assert!(hamming(orig.dhash, blue.dhash) > 8, "blue should be unrelated");

    let groups = find_duplicate_groups(&entries, 8);

    let exact_found = groups
        .iter()
        .filter(|g| g.kind == DuplicateKind::Exact)
        .any(|g| {
            g.paths.iter().any(|p| p.ends_with("orig.jpg"))
                && g.paths.iter().any(|p| p.ends_with("orig_copy.jpg"))
        });
    assert!(exact_found, "missing exact group");

    let similar_found = groups
        .iter()
        .filter(|g| g.kind == DuplicateKind::Similar)
        .any(|g| {
            g.paths.iter().any(|p| p.ends_with("orig.jpg"))
                && g.paths.iter().any(|p| p.ends_with("orig_resized.jpg"))
        });
    assert!(similar_found, "missing similar group");

    let blue_kept_apart = groups.iter().all(|g| {
        !(g.paths.iter().any(|p| p.ends_with("solid_blue.jpg"))
            && g.paths.iter().any(|p| p.ends_with("orig.jpg")))
    });
    assert!(blue_kept_apart, "false positive with unrelated image");
}
/// Scanning a directory without files succeeds and yields nothing.
#[test]
fn scan_empty_dir_returns_no_entries() {
    let empty_root = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images/empty_dir");
    let found = scan_images(empty_root).expect("scan empty dir");
    let nothing_found = found.is_empty();
    assert!(nothing_found, "empty dir should yield no entries");
}
/// A cropped copy has different bytes but stays within 12 dHash bits of the
/// original.
#[test]
fn cropped_image_similar_to_original() {
    let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images");
    let entries = scan_images(dir).expect("scan");

    let orig = entries
        .iter()
        .find(|e| e.path.ends_with("orig.jpg"))
        .expect("orig.jpg");
    let cropped = entries
        .iter()
        .find(|e| e.path.ends_with("orig_cropped.jpg"))
        .expect("orig_cropped.jpg");

    assert_ne!(orig.sha256, cropped.sha256, "cropped should differ in bytes");
    let distance = hamming(orig.dhash, cropped.dhash);
    assert!(distance <= 12, "cropped should be perceptually similar (hamming <= 12)");
}
/// Files without an image extension never show up in scan results.
#[test]
fn non_image_files_excluded_from_scan() {
    const IMAGE_EXTS: [&str; 8] = ["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"];

    let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images");
    let entries = scan_images(dir).expect("scan");

    // readme.txt and data.csv exist in dir but must not appear in results
    let txt_absent = entries.iter().all(|e| !e.path.ends_with("readme.txt"));
    assert!(txt_absent, "txt file should be excluded");
    let csv_absent = entries.iter().all(|e| !e.path.ends_with("data.csv"));
    assert!(csv_absent, "csv file should be excluded");

    // all entries should have image extensions
    for scanned in &entries {
        let ext = scanned
            .path
            .extension()
            .unwrap()
            .to_str()
            .unwrap()
            .to_ascii_lowercase();
        assert!(
            IMAGE_EXTS.contains(&ext.as_str()),
            "unexpected ext: {ext} in {}",
            scanned.path.display()
        );
    }
}
/// The scan descends into sub-folders, and a byte copy living there is
/// grouped with its top-level twin.
#[test]
fn scan_recurses_into_subdirectories() {
    let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images");
    let entries = scan_images(dir).expect("scan");

    // True for paths that mention "subdir" anywhere.
    let in_subdir = |p: &Path| p.to_str().unwrap().contains("subdir");

    // gradient_sub.jpg lives in subdir/ - must be found
    let sub = entries
        .iter()
        .find(|e| e.path.ends_with("subdir/gradient_sub.jpg"))
        .expect("should find image in subdirectory");

    // it's an exact copy of gradient.jpg
    let grad = entries
        .iter()
        .find(|e| e.path.ends_with("gradient.jpg") && !in_subdir(&e.path))
        .unwrap();
    assert_eq!(grad.sha256, sub.sha256, "exact copy should have same sha256");

    let groups = find_duplicate_groups(&entries, 8);
    let grouped_as_exact = groups
        .iter()
        .filter(|g| g.kind == DuplicateKind::Exact)
        .any(|g| {
            g.paths
                .iter()
                .any(|p| p.ends_with("gradient.jpg") && !in_subdir(p))
                && g.paths.iter().any(|p| p.ends_with("gradient_sub.jpg"))
        });
    assert!(
        grouped_as_exact,
        "should group gradient.jpg and subdir/gradient_sub.jpg as exact"
    );
}
/// A folder holding exactly one image produces one entry and no groups.
#[test]
fn single_image_no_duplicates() {
    let lone_dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images/single");
    let scanned = scan_images(lone_dir).expect("scan");
    assert_eq!(scanned.len(), 1, "should find exactly one image");

    let duplicate_groups = find_duplicate_groups(&scanned, 8);
    let none_reported = duplicate_groups.is_empty();
    assert!(none_reported, "single image should produce no duplicate groups");
}
/// Runs the compiled `deduper` binary on the fixtures with threshold 8 and
/// checks that it exits successfully and prints both kinds of group.
#[test]
fn cli_binary_reports_duplicates() {
    let mut command = std::process::Command::new(env!("CARGO_BIN_EXE_deduper"));
    command.arg("/a0/usr/projects/deduper/.a0proj/test_media/images");
    command.arg("8");
    let output = command.output().expect("failed to run deduper binary");

    let stdout = String::from_utf8_lossy(&output.stdout);
    assert!(output.status.success(), "binary should exit 0");
    for (marker, label) in [("[exact]", "exact"), ("[similar]", "similar")] {
        assert!(stdout.contains(marker), "output should contain {label} groups: {stdout}");
    }
}