diff --git a/src/lib.rs b/src/lib.rs index a0147c8..051bf63 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,7 @@ pub struct ImageEntry { pub path: PathBuf, pub sha256: String, pub dhash: u64, + pub file_size: u64, } #[derive(Debug, Clone, PartialEq, Eq)] @@ -68,6 +69,7 @@ pub fn scan_images(root: &Path) -> Result> { path: path.to_path_buf(), sha256, dhash, + file_size: bytes.len() as u64, }); } Ok(out) @@ -96,8 +98,15 @@ pub fn compute_dhash(img: &image::DynamicImage) -> u64 { pub fn hamming(a: u64, b: u64) -> u32 { (a ^ b).count_ones() } - pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> Vec { + find_duplicate_groups_with_size_ratio(entries, hamming_threshold, 0.80) +} + +pub fn find_duplicate_groups_with_size_ratio( + entries: &[ImageEntry], + hamming_threshold: u32, + min_size_ratio: f64, +) -> Vec { let mut groups = Vec::new(); let mut exact: HashMap<&str, Vec> = HashMap::new(); @@ -138,7 +147,14 @@ pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> continue; } if hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold { - union(&mut parent, i, j); + let (small, big) = if entries[i].file_size <= entries[j].file_size { + (entries[i].file_size, entries[j].file_size) + } else { + (entries[j].file_size, entries[i].file_size) + }; + if big == 0 || (small as f64 / big as f64) >= min_size_ratio { + union(&mut parent, i, j); + } } } } @@ -245,6 +261,16 @@ mod tests { path: PathBuf::from(path), sha256: sha.to_string(), dhash, + file_size: 1000, + } + } + + fn make_entry_sized(path: &str, sha: &str, dhash: u64, file_size: u64) -> ImageEntry { + ImageEntry { + path: PathBuf::from(path), + sha256: sha.to_string(), + dhash, + file_size, } } @@ -303,4 +329,26 @@ mod tests { let groups7 = find_duplicate_groups(&entries, 7); assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar)); } + + #[test] + fn similar_groups_filtered_by_size_ratio() { + // Near-identical dhash (hamming=1) but very different file 
sizes + let entries = vec![ + make_entry_sized("big.jpg", "aaa", 0b0000_0000, 100_000), // 100KB + make_entry_sized("tiny.jpg", "bbb", 0b0000_0001, 5_000), // 5KB (5% of big) + ]; + // Default 80% ratio should NOT group them + let groups = find_duplicate_groups(&entries, 8); + assert!(!groups.iter().any(|g| g.kind == DuplicateKind::Similar), + "files with very different sizes should not be grouped as similar"); + + // Near-identical dhash (hamming=1), similar file sizes should group + let entries2 = vec![ + make_entry_sized("a.jpg", "aaa", 0b0000_0000, 100_000), + make_entry_sized("b.jpg", "bbb", 0b0000_0001, 90_000), // 90% of big + ]; + let groups2 = find_duplicate_groups(&entries2, 8); + assert!(groups2.iter().any(|g| g.kind == DuplicateKind::Similar), + "files with similar sizes should be grouped"); + } }