feat: detect image format from magic bytes, not file extension
- Fixes misnamed files (e.g. JPEG saved as .png) being skipped
- Uses image::ImageReader with guessed format from content
- Fixes Android screenshots with wrong extension being skipped
- New test: misnamed_jpeg_as_png_still_scanned
- 22 tests passing
This commit is contained in:
10
src/lib.rs
10
src/lib.rs
@@ -1,8 +1,10 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use image::imageops::FilterType;
|
use image::imageops::FilterType;
|
||||||
|
use image::ImageReader;
|
||||||
use sha2::{Digest, Sha256};
|
use sha2::{Digest, Sha256};
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
use std::fs;
|
use std::fs;
|
||||||
|
use std::io::Cursor;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
@@ -48,12 +50,18 @@ pub fn scan_images(root: &Path) -> Result<Vec<ImageEntry>> {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
let sha256 = format!("{:x}", Sha256::digest(&bytes));
|
let sha256 = format!("{:x}", Sha256::digest(&bytes));
|
||||||
let img = match image::open(path) {
|
let img = match ImageReader::new(Cursor::new(&bytes)).with_guessed_format() {
|
||||||
|
Ok(reader) => match reader.decode() {
|
||||||
Ok(i) => i,
|
Ok(i) => i,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
eprintln!("warning: skipping {}: {e}", path.display());
|
eprintln!("warning: skipping {}: {e}", path.display());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("warning: skipping {}: {e}", path.display());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
let dhash = compute_dhash(&img);
|
let dhash = compute_dhash(&img);
|
||||||
out.push(ImageEntry {
|
out.push(ImageEntry {
|
||||||
|
|||||||
@@ -133,3 +133,16 @@ fn cli_binary_reports_duplicates() {
|
|||||||
assert!(stdout.contains("[exact]"), "output should contain exact groups: {stdout}");
|
assert!(stdout.contains("[exact]"), "output should contain exact groups: {stdout}");
|
||||||
assert!(stdout.contains("[similar]"), "output should contain similar groups: {stdout}");
|
assert!(stdout.contains("[similar]"), "output should contain similar groups: {stdout}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Regression test: an image whose extension disagrees with its content must
/// still appear in the `scan_images` results.
///
/// The fixture `fake_png.png` holds JPEG data under a `.png` name and is a
/// byte-for-byte copy of `orig.jpg`, so both entries must report the same
/// SHA-256 digest.
#[test]
fn misnamed_jpeg_as_png_still_scanned() {
    // NOTE(review): this is an absolute, machine-specific fixture path; the
    // test fails on any checkout located elsewhere. Consider resolving it
    // relative to the crate root once the fixture layout is confirmed.
    let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images");
    let entries = scan_images(dir).expect("scan");

    // `expect` replaces the earlier `assert!(is_some())` + bare `unwrap()`
    // pair: one lookup, one failure point, same diagnostic message.
    let fake = entries
        .iter()
        .find(|e| e.path.ends_with("fake_png.png"))
        .expect("misnamed JPEG-as-PNG should be scanned via magic bytes");

    // A missing reference fixture now fails with an explanation instead of an
    // anonymous `unwrap` panic.
    let orig = entries
        .iter()
        .find(|e| e.path.ends_with("orig.jpg"))
        .expect("orig.jpg fixture should be present in the scan results");

    // Identical bytes on disk must hash identically, whatever the file name.
    assert_eq!(orig.sha256, fake.sha256, "same content = same sha256");
}
|
||||||
|
|||||||
Reference in New Issue
Block a user