feat: filter similar groups by file size ratio (80% default)

- Thumbnails vs originals no longer grouped as similar
- Added file_size to ImageEntry
- find_duplicate_groups now checks min size ratio for similar pairs
- New API: find_duplicate_groups_with_size_ratio for custom ratio
- New test: similar_groups_filtered_by_size_ratio
- 24 tests passing
This commit is contained in:
admin
2026-04-28 00:28:18 +00:00
parent 22b977d9c4
commit 03728a11e9

View File

@@ -13,6 +13,7 @@ pub struct ImageEntry {
pub path: PathBuf,
pub sha256: String,
pub dhash: u64,
pub file_size: u64,
}
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -68,6 +69,7 @@ pub fn scan_images(root: &Path) -> Result<Vec<ImageEntry>> {
path: path.to_path_buf(),
sha256,
dhash,
file_size: bytes.len() as u64,
});
}
Ok(out)
@@ -96,8 +98,15 @@ pub fn compute_dhash(img: &image::DynamicImage) -> u64 {
/// Returns the Hamming distance between two 64-bit hashes, i.e. the number of
/// bit positions in which `a` and `b` differ.
pub fn hamming(a: u64, b: u64) -> u32 {
    // XOR leaves a 1 exactly where the two inputs disagree.
    let differing_bits = a ^ b;
    differing_bits.count_ones()
}
/// Finds duplicate groups among `entries` using the default minimum
/// file-size ratio of 0.80.
///
/// This is a convenience wrapper: `entries` and `hamming_threshold` are
/// forwarded unchanged to [`find_duplicate_groups_with_size_ratio`], which
/// accepts a custom ratio.
pub fn find_duplicate_groups(
    entries: &[ImageEntry],
    hamming_threshold: u32,
) -> Vec<DuplicateGroup> {
    // Ratio applied when the caller does not pick one explicitly.
    const DEFAULT_MIN_SIZE_RATIO: f64 = 0.80;

    find_duplicate_groups_with_size_ratio(entries, hamming_threshold, DEFAULT_MIN_SIZE_RATIO)
}
pub fn find_duplicate_groups_with_size_ratio(
entries: &[ImageEntry],
hamming_threshold: u32,
min_size_ratio: f64,
) -> Vec<DuplicateGroup> {
let mut groups = Vec::new();
let mut exact: HashMap<&str, Vec<PathBuf>> = HashMap::new();
@@ -138,10 +147,17 @@ pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) ->
continue;
}
if hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold {
let (small, big) = if entries[i].file_size <= entries[j].file_size {
(entries[i].file_size, entries[j].file_size)
} else {
(entries[j].file_size, entries[i].file_size)
};
if big == 0 || (small as f64 / big as f64) >= min_size_ratio {
union(&mut parent, i, j);
}
}
}
}
let mut similar: HashMap<usize, Vec<PathBuf>> = HashMap::new();
for (idx, e) in entries.iter().enumerate() {
@@ -245,6 +261,16 @@ mod tests {
path: PathBuf::from(path),
sha256: sha.to_string(),
dhash,
file_size: 1000,
}
}
/// Test helper: builds an `ImageEntry` from the given path, SHA-256 string,
/// dhash and an explicit `file_size`.
fn make_entry_sized(path: &str, sha: &str, dhash: u64, file_size: u64) -> ImageEntry {
    let path = PathBuf::from(path);
    let sha256 = String::from(sha);
    ImageEntry {
        path,
        sha256,
        dhash,
        file_size,
    }
}
@@ -303,4 +329,26 @@ mod tests {
let groups7 = find_duplicate_groups(&entries, 7);
assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar));
}
/// Pairs whose dhashes are within the Hamming threshold are grouped as
/// `Similar` only when their file sizes are close enough under the default
/// ratio used by `find_duplicate_groups`.
#[test]
fn similar_groups_filtered_by_size_ratio() {
    // The dhashes differ by a single bit (Hamming distance 1, inside the
    // threshold of 8), but the second file is only 5% of the first's size.
    let mismatched = [
        make_entry_sized("big.jpg", "aaa", 0b0000_0000, 100_000), // 100 KB
        make_entry_sized("tiny.jpg", "bbb", 0b0000_0001, 5_000),  // 5 KB
    ];
    // With the default 0.80 ratio no Similar group may be produced.
    let mismatched_groups = find_duplicate_groups(&mismatched, 8);
    let none_similar = mismatched_groups
        .iter()
        .all(|g| g.kind != DuplicateKind::Similar);
    assert!(
        none_similar,
        "files with very different sizes should not be grouped as similar"
    );

    // Same one-bit dhash difference, but the smaller file is 90% of the
    // larger one, which satisfies the default ratio.
    let comparable = [
        make_entry_sized("a.jpg", "aaa", 0b0000_0000, 100_000),
        make_entry_sized("b.jpg", "bbb", 0b0000_0001, 90_000),
    ];
    let comparable_groups = find_duplicate_groups(&comparable, 8);
    let some_similar = comparable_groups
        .iter()
        .any(|g| g.kind == DuplicateKind::Similar);
    assert!(some_similar, "files with similar sizes should be grouped");
}
}