feat: detect image format from magic bytes, not file extension

- Fixes misnamed files (e.g. JPEG saved as .png) being skipped
- Decodes from the already-read bytes via image::ImageReader::with_guessed_format(), so the format is sniffed from content
- Fixes Android screenshots with wrong extension being skipped
- New test: misnamed_jpeg_as_png_still_scanned
- 22 tests passing
This commit is contained in:
admin
2026-04-27 23:57:20 +00:00
parent deb5321a8a
commit 9dc8a495bb
2 changed files with 23 additions and 2 deletions

View File

@@ -1,8 +1,10 @@
use anyhow::Result; use anyhow::Result;
use image::imageops::FilterType; use image::imageops::FilterType;
use image::ImageReader;
use sha2::{Digest, Sha256}; use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::fs; use std::fs;
use std::io::Cursor;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use walkdir::WalkDir; use walkdir::WalkDir;
@@ -48,12 +50,18 @@ pub fn scan_images(root: &Path) -> Result<Vec<ImageEntry>> {
} }
}; };
let sha256 = format!("{:x}", Sha256::digest(&bytes)); let sha256 = format!("{:x}", Sha256::digest(&bytes));
let img = match image::open(path) { let img = match ImageReader::new(Cursor::new(&bytes)).with_guessed_format() {
Ok(reader) => match reader.decode() {
Ok(i) => i, Ok(i) => i,
Err(e) => { Err(e) => {
eprintln!("warning: skipping {}: {e}", path.display()); eprintln!("warning: skipping {}: {e}", path.display());
continue; continue;
} }
},
Err(e) => {
eprintln!("warning: skipping {}: {e}", path.display());
continue;
}
}; };
let dhash = compute_dhash(&img); let dhash = compute_dhash(&img);
out.push(ImageEntry { out.push(ImageEntry {

View File

@@ -133,3 +133,16 @@ fn cli_binary_reports_duplicates() {
assert!(stdout.contains("[exact]"), "output should contain exact groups: {stdout}"); assert!(stdout.contains("[exact]"), "output should contain exact groups: {stdout}");
assert!(stdout.contains("[similar]"), "output should contain similar groups: {stdout}"); assert!(stdout.contains("[similar]"), "output should contain similar groups: {stdout}");
} }
/// Regression test: an image whose extension disagrees with its content must
/// still appear in the scan results.
///
/// The fixture `fake_png.png` is JPEG data saved with a `.png` extension and is
/// a copy of `orig.jpg`, so both entries are expected to report the same
/// SHA-256 digest.
#[test]
fn misnamed_jpeg_as_png_still_scanned() {
    // NOTE(review): this is an absolute, machine-specific fixture path; the test
    // can only pass on a host where this directory exists. Consider resolving
    // the fixtures relative to the crate instead — confirm the intended layout.
    let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images");
    let entries = scan_images(dir).expect("scan");

    // fake_png.png is actually JPEG data with .png extension; if the scanner
    // skipped it, the lookup fails here with an explanatory message.
    let fake = entries
        .iter()
        .find(|e| e.path.ends_with("fake_png.png"))
        .expect("misnamed JPEG-as-PNG should be scanned via magic bytes");

    // A missing reference fixture should fail with a clear message rather than
    // an anonymous `unwrap` panic.
    let orig = entries
        .iter()
        .find(|e| e.path.ends_with("orig.jpg"))
        .expect("orig.jpg fixture should be present in the scan results");

    // should have same hash as orig.jpg since it's a copy
    assert_eq!(orig.sha256, fake.sha256, "same content = same sha256");
}