feat: filter similar groups by file size ratio (80% default)
- Thumbnails vs. originals are no longer grouped as similar
- Added file_size to ImageEntry
- find_duplicate_groups now checks the minimum size ratio for similar pairs
- New API: find_duplicate_groups_with_size_ratio for a custom ratio
- New test: similar_groups_filtered_by_size_ratio
- 24 tests passing
This commit is contained in:
52
src/lib.rs
52
src/lib.rs
@@ -13,6 +13,7 @@ pub struct ImageEntry {
|
|||||||
pub path: PathBuf,
|
pub path: PathBuf,
|
||||||
pub sha256: String,
|
pub sha256: String,
|
||||||
pub dhash: u64,
|
pub dhash: u64,
|
||||||
|
pub file_size: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
@@ -68,6 +69,7 @@ pub fn scan_images(root: &Path) -> Result<Vec<ImageEntry>> {
|
|||||||
path: path.to_path_buf(),
|
path: path.to_path_buf(),
|
||||||
sha256,
|
sha256,
|
||||||
dhash,
|
dhash,
|
||||||
|
file_size: bytes.len() as u64,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
Ok(out)
|
Ok(out)
|
||||||
@@ -96,8 +98,15 @@ pub fn compute_dhash(img: &image::DynamicImage) -> u64 {
|
|||||||
pub fn hamming(a: u64, b: u64) -> u32 {
|
pub fn hamming(a: u64, b: u64) -> u32 {
|
||||||
(a ^ b).count_ones()
|
(a ^ b).count_ones()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> Vec<DuplicateGroup> {
|
pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> Vec<DuplicateGroup> {
|
||||||
|
find_duplicate_groups_with_size_ratio(entries, hamming_threshold, 0.80)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn find_duplicate_groups_with_size_ratio(
|
||||||
|
entries: &[ImageEntry],
|
||||||
|
hamming_threshold: u32,
|
||||||
|
min_size_ratio: f64,
|
||||||
|
) -> Vec<DuplicateGroup> {
|
||||||
let mut groups = Vec::new();
|
let mut groups = Vec::new();
|
||||||
|
|
||||||
let mut exact: HashMap<&str, Vec<PathBuf>> = HashMap::new();
|
let mut exact: HashMap<&str, Vec<PathBuf>> = HashMap::new();
|
||||||
@@ -138,7 +147,14 @@ pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) ->
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold {
|
if hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold {
|
||||||
union(&mut parent, i, j);
|
let (small, big) = if entries[i].file_size <= entries[j].file_size {
|
||||||
|
(entries[i].file_size, entries[j].file_size)
|
||||||
|
} else {
|
||||||
|
(entries[j].file_size, entries[i].file_size)
|
||||||
|
};
|
||||||
|
if big == 0 || (small as f64 / big as f64) >= min_size_ratio {
|
||||||
|
union(&mut parent, i, j);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -245,6 +261,16 @@ mod tests {
|
|||||||
path: PathBuf::from(path),
|
path: PathBuf::from(path),
|
||||||
sha256: sha.to_string(),
|
sha256: sha.to_string(),
|
||||||
dhash,
|
dhash,
|
||||||
|
file_size: 1000,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn make_entry_sized(path: &str, sha: &str, dhash: u64, file_size: u64) -> ImageEntry {
|
||||||
|
ImageEntry {
|
||||||
|
path: PathBuf::from(path),
|
||||||
|
sha256: sha.to_string(),
|
||||||
|
dhash,
|
||||||
|
file_size,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -303,4 +329,26 @@ mod tests {
|
|||||||
let groups7 = find_duplicate_groups(&entries, 7);
|
let groups7 = find_duplicate_groups(&entries, 7);
|
||||||
assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar));
|
assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn similar_groups_filtered_by_size_ratio() {
|
||||||
|
// Near-identical dhash (hamming=1, within the threshold of 8) but very different file sizes
|
||||||
|
let entries = vec![
|
||||||
|
make_entry_sized("big.jpg", "aaa", 0b0000_0000, 100_000), // 100KB
|
||||||
|
make_entry_sized("tiny.jpg", "bbb", 0b0000_0001, 5_000), // 5KB (5% of big)
|
||||||
|
];
|
||||||
|
// Default 80% ratio should NOT group them
|
||||||
|
let groups = find_duplicate_groups(&entries, 8);
|
||||||
|
assert!(!groups.iter().any(|g| g.kind == DuplicateKind::Similar),
|
||||||
|
"files with very different sizes should not be grouped as similar");
|
||||||
|
|
||||||
|
// Near-identical dhash (hamming=1) with similar file sizes should group
|
||||||
|
let entries2 = vec![
|
||||||
|
make_entry_sized("a.jpg", "aaa", 0b0000_0000, 100_000),
|
||||||
|
make_entry_sized("b.jpg", "bbb", 0b0000_0001, 90_000), // 90% of big
|
||||||
|
];
|
||||||
|
let groups2 = find_duplicate_groups(&entries2, 8);
|
||||||
|
assert!(groups2.iter().any(|g| g.kind == DuplicateKind::Similar),
|
||||||
|
"files with similar sizes should be grouped");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user