feat: filter similar groups by file size ratio (80% default)
- Thumbnails vs. originals are no longer grouped as similar
- Added file_size to ImageEntry
- find_duplicate_groups now checks the minimum size ratio for similar pairs
- New API: find_duplicate_groups_with_size_ratio for a custom ratio
- New test: similar_groups_filtered_by_size_ratio
- 24 tests passing
This commit is contained in:
50
src/lib.rs
50
src/lib.rs
@@ -13,6 +13,7 @@ pub struct ImageEntry {
|
||||
pub path: PathBuf,
|
||||
pub sha256: String,
|
||||
pub dhash: u64,
|
||||
pub file_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
@@ -68,6 +69,7 @@ pub fn scan_images(root: &Path) -> Result<Vec<ImageEntry>> {
|
||||
path: path.to_path_buf(),
|
||||
sha256,
|
||||
dhash,
|
||||
file_size: bytes.len() as u64,
|
||||
});
|
||||
}
|
||||
Ok(out)
|
||||
@@ -96,8 +98,15 @@ pub fn compute_dhash(img: &image::DynamicImage) -> u64 {
|
||||
/// Returns the Hamming distance between two 64-bit hashes: the number of bit
/// positions in which `a` and `b` differ.
pub fn hamming(a: u64, b: u64) -> u32 {
    // XOR leaves a 1 in every position where the two inputs disagree.
    let differing_bits = a ^ b;
    differing_bits.count_ones()
}
|
||||
|
||||
pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> Vec<DuplicateGroup> {
|
||||
find_duplicate_groups_with_size_ratio(entries, hamming_threshold, 0.80)
|
||||
}
|
||||
|
||||
pub fn find_duplicate_groups_with_size_ratio(
|
||||
entries: &[ImageEntry],
|
||||
hamming_threshold: u32,
|
||||
min_size_ratio: f64,
|
||||
) -> Vec<DuplicateGroup> {
|
||||
let mut groups = Vec::new();
|
||||
|
||||
let mut exact: HashMap<&str, Vec<PathBuf>> = HashMap::new();
|
||||
@@ -138,10 +147,17 @@ pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) ->
|
||||
continue;
|
||||
}
|
||||
if hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold {
|
||||
let (small, big) = if entries[i].file_size <= entries[j].file_size {
|
||||
(entries[i].file_size, entries[j].file_size)
|
||||
} else {
|
||||
(entries[j].file_size, entries[i].file_size)
|
||||
};
|
||||
if big == 0 || (small as f64 / big as f64) >= min_size_ratio {
|
||||
union(&mut parent, i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut similar: HashMap<usize, Vec<PathBuf>> = HashMap::new();
|
||||
for (idx, e) in entries.iter().enumerate() {
|
||||
@@ -245,6 +261,16 @@ mod tests {
|
||||
path: PathBuf::from(path),
|
||||
sha256: sha.to_string(),
|
||||
dhash,
|
||||
file_size: 1000,
|
||||
}
|
||||
}
|
||||
|
||||
fn make_entry_sized(path: &str, sha: &str, dhash: u64, file_size: u64) -> ImageEntry {
|
||||
ImageEntry {
|
||||
path: PathBuf::from(path),
|
||||
sha256: sha.to_string(),
|
||||
dhash,
|
||||
file_size,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -303,4 +329,26 @@ mod tests {
|
||||
let groups7 = find_duplicate_groups(&entries, 7);
|
||||
assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar));
|
||||
}
|
||||
|
||||
#[test]
fn similar_groups_filtered_by_size_ratio() {
    // The two hashes differ in exactly one bit (Hamming distance 1), well
    // inside the threshold of 8, so only the file-size filter can keep the
    // entries apart.
    let entries = vec![
        make_entry_sized("big.jpg", "aaa", 0b0000_0000, 100_000), // 100KB
        make_entry_sized("tiny.jpg", "bbb", 0b0000_0001, 5_000),  // 5KB (5% of big)
    ];
    // Default 80% ratio should NOT group them
    let groups = find_duplicate_groups(&entries, 8);
    assert!(
        !groups.iter().any(|g| g.kind == DuplicateKind::Similar),
        "files with very different sizes should not be grouped as similar"
    );

    // The same pair passes the filter once the caller lowers the ratio below
    // 5% through the custom-ratio entry point.
    let relaxed = find_duplicate_groups_with_size_ratio(&entries, 8, 0.01);
    assert!(
        relaxed.iter().any(|g| g.kind == DuplicateKind::Similar),
        "a custom ratio below the actual size ratio should allow grouping"
    );

    // Same one-bit hash difference, but sizes within the default ratio: should group.
    let entries2 = vec![
        make_entry_sized("a.jpg", "aaa", 0b0000_0000, 100_000),
        make_entry_sized("b.jpg", "bbb", 0b0000_0001, 90_000), // 90% of big
    ];
    let groups2 = find_duplicate_groups(&entries2, 8);
    assert!(
        groups2.iter().any(|g| g.kind == DuplicateKind::Similar),
        "files with similar sizes should be grouped"
    );
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user