feat: "Not a dupe" ignore with SQLite persistence

- New ignore_db module with SQLite-backed dismissal storage
- Groups flagged as not-a-dupe are persisted to ~/.config/deduper/ignores.db
- Fingerprint based on sorted SHA256 hashes (content-stable)
- Ignored groups filtered out on subsequent runs
- Review UI: green "Not a dupe" button per group
- Dismissed groups fade out immediately in browser
- DEDUPER_DB_DIR env var to override DB location
- 4 new unit tests for ignore_db
- 29 tests passing
This commit is contained in:
admin
2026-04-28 00:45:52 +00:00
parent f13b712e99
commit c039029790
6 changed files with 301 additions and 7 deletions

80
Cargo.lock generated
View File

@@ -8,6 +8,18 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
[[package]]
name = "ahash"
version = "0.8.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "aligned"
version = "0.4.3"
@@ -276,6 +288,7 @@ dependencies = [
"base64",
"image",
"open",
"rusqlite",
"serde",
"serde_json",
"sha2",
@@ -334,6 +347,18 @@ dependencies = [
"zune-inflate",
]
[[package]]
name = "fallible-iterator"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
[[package]]
name = "fallible-streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fax"
version = "0.2.7"
@@ -408,6 +433,24 @@ dependencies = [
"zerocopy",
]
[[package]]
name = "hashbrown"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
dependencies = [
"ahash",
]
[[package]]
name = "hashlink"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af"
dependencies = [
"hashbrown",
]
[[package]]
name = "httpdate"
version = "1.0.3"
@@ -531,6 +574,17 @@ dependencies = [
"cc",
]
[[package]]
name = "libsqlite3-sys"
version = "0.28.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c10584274047cb335c23d3e61bcef8e323adae7c5c8c760540f73610177fc3f"
dependencies = [
"cc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "log"
version = "0.4.29"
@@ -697,6 +751,12 @@ version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3"
[[package]]
name = "pkg-config"
version = "0.3.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
[[package]]
name = "png"
version = "0.18.1"
@@ -888,6 +948,20 @@ version = "0.8.53"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4"
[[package]]
name = "rusqlite"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b838eba278d213a8beaf485bd313fd580ca4505a00d5871caeb1457c55322cae"
dependencies = [
"bitflags",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
"libsqlite3-sys",
"smallvec",
]
[[package]]
name = "rustversion"
version = "1.0.22"
@@ -1070,6 +1144,12 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.5"

View File

@@ -13,3 +13,4 @@ open = "5"
base64 = "0.22"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
rusqlite = { version = "0.31", features = ["bundled"] }

125
src/ignore_db.rs Normal file
View File

@@ -0,0 +1,125 @@
use anyhow::Result;
use rusqlite::Connection;
use std::path::PathBuf;
/// Returns the full path of the ignore database file (`ignores.db`).
///
/// The containing directory is created on a best-effort basis; a failure to
/// create it is deliberately discarded here.
fn db_path() -> PathBuf {
    let mut location = dirs_or_default();
    let _ = std::fs::create_dir_all(&location);
    location.push("ignores.db");
    location
}
/// Resolves the directory that holds the ignore database.
///
/// `DEDUPER_DB_DIR` wins when it is set to a non-empty value. Otherwise the
/// directory is `$HOME/.config/deduper`, with `/tmp` standing in for a missing
/// or empty `HOME`.
fn dirs_or_default() -> PathBuf {
    // `var_os` keeps non-UTF-8 paths usable (`var` would reject them), and an
    // empty value is treated as unset so we never produce a cwd-relative path.
    let non_empty = |key: &str| std::env::var_os(key).filter(|value| !value.is_empty());

    non_empty("DEDUPER_DB_DIR")
        .map(PathBuf::from)
        .unwrap_or_else(|| {
            let home = non_empty("HOME").map_or_else(|| PathBuf::from("/tmp"), PathBuf::from);
            home.join(".config").join("deduper")
        })
}
/// Opens the on-disk ignore database and makes sure the `ignored_groups`
/// table exists.
///
/// # Errors
/// Returns an error if the database cannot be opened or the schema statement
/// fails.
pub fn open_db() -> Result<Connection> {
    const SCHEMA: &str = "CREATE TABLE IF NOT EXISTS ignored_groups (
            fingerprint TEXT PRIMARY KEY,
            created_at TEXT DEFAULT (datetime('now')),
            note TEXT DEFAULT ''
        );";

    let db = Connection::open(db_path())?;
    db.execute_batch(SCHEMA)?;
    Ok(db)
}
/// Builds a group fingerprint: the distinct SHA-256 strings in ascending
/// order, joined by `|`.
///
/// Input order and repeated hashes do not affect the result; an empty slice
/// yields an empty string.
pub fn group_fingerprint(sha256s: &[&str]) -> String {
    // An ordered set sorts and de-duplicates in a single pass.
    let distinct: std::collections::BTreeSet<&str> = sha256s.iter().copied().collect();
    distinct.into_iter().collect::<Vec<&str>>().join("|")
}
/// Records `fingerprint` as ignored.
///
/// Uses `INSERT OR IGNORE`, so recording a fingerprint that is already stored
/// is not an error.
///
/// # Errors
/// Returns an error if the statement fails to execute.
pub fn ignore_group(conn: &Connection, fingerprint: &str) -> Result<()> {
    const INSERT_SQL: &str = "INSERT OR IGNORE INTO ignored_groups (fingerprint) VALUES (?1)";

    // The affected-row count is not needed by callers.
    let _inserted = conn.execute(INSERT_SQL, [fingerprint])?;
    Ok(())
}
pub fn is_group_ignored(conn: &Connection, fingerprint: &str) -> bool {
conn.query_row(
"SELECT 1 FROM ignored_groups WHERE fingerprint = ?1",
[fingerprint],
|_| Ok(true),
)
.unwrap_or(false)
}
/// Deletes the stored row for `fingerprint`, if any.
///
/// # Errors
/// Returns an error if the statement fails to execute.
pub fn remove_ignore(conn: &Connection, fingerprint: &str) -> Result<()> {
    const DELETE_SQL: &str = "DELETE FROM ignored_groups WHERE fingerprint = ?1";

    // The affected-row count is not needed by callers.
    let _removed = conn.execute(DELETE_SQL, [fingerprint])?;
    Ok(())
}
pub fn list_ignored(conn: &Connection) -> Result<Vec<(String, String)>> {
let mut stmt = conn.prepare("SELECT fingerprint, created_at FROM ignored_groups ORDER BY created_at DESC")?;
let rows = stmt.query_map([], |row| {
Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
})?;
Ok(rows.filter_map(|r| r.ok()).collect())
}
/// Opens an in-memory database and creates the `ignored_groups` table in it.
///
/// # Errors
/// Returns an error if the connection cannot be opened or the schema
/// statement fails.
pub fn open_db_in_memory() -> Result<Connection> {
    const SCHEMA: &str = "CREATE TABLE IF NOT EXISTS ignored_groups (
            fingerprint TEXT PRIMARY KEY,
            created_at TEXT DEFAULT (datetime('now')),
            note TEXT DEFAULT ''
        );";

    let db = Connection::open_in_memory()?;
    db.execute_batch(SCHEMA)?;
    Ok(db)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A group is reported as ignored only after it has been recorded.
    #[test]
    fn ignore_and_check_group() {
        let db = open_db_in_memory().unwrap();
        let fingerprint = group_fingerprint(&["sha_b", "sha_a"]);

        assert!(!is_group_ignored(&db, &fingerprint));
        ignore_group(&db, &fingerprint).unwrap();
        assert!(is_group_ignored(&db, &fingerprint));
    }

    /// Input order must not influence the fingerprint.
    #[test]
    fn fingerprint_is_sorted_and_stable() {
        let reversed = group_fingerprint(&["bbb", "aaa"]);
        let forward = group_fingerprint(&["aaa", "bbb"]);

        assert_eq!(reversed, forward);
        assert_eq!(reversed, "aaa|bbb");
    }

    /// Removing a fingerprint makes the group visible again.
    #[test]
    fn remove_ignore_works() {
        let db = open_db_in_memory().unwrap();
        let fingerprint = group_fingerprint(&["x", "y"]);

        ignore_group(&db, &fingerprint).unwrap();
        assert!(is_group_ignored(&db, &fingerprint));

        remove_ignore(&db, &fingerprint).unwrap();
        assert!(!is_group_ignored(&db, &fingerprint));
    }

    /// Every recorded fingerprint shows up in the listing.
    #[test]
    fn list_ignored_returns_entries() {
        let db = open_db_in_memory().unwrap();
        for fingerprint in ["fp1", "fp2"] {
            ignore_group(&db, fingerprint).unwrap();
        }

        let listed = list_ignored(&db).unwrap();
        assert_eq!(listed.len(), 2);
    }
}

View File

@@ -1,3 +1,4 @@
pub mod ignore_db;
use anyhow::Result;
use image::imageops::FilterType;
use image::ImageReader;

View File

@@ -1,4 +1,5 @@
use deduper::{find_duplicate_groups, scan_images, DuplicateKind};
use deduper::ignore_db;
use std::env;
use std::path::Path;
@@ -56,14 +57,28 @@ fn main() {
}
};
let groups = find_duplicate_groups(&entries, config.threshold);
let mut groups = find_duplicate_groups(&entries, config.threshold);
// Filter out ignored groups
let db = ignore_db::open_db().ok();
if let Some(ref conn) = db {
groups.retain(|g| {
let sha_list: Vec<&str> = entries.iter()
.filter(|e| g.paths.contains(&e.path))
.map(|e| e.sha256.as_str())
.collect();
let fp = ignore_db::group_fingerprint(&sha_list);
!ignore_db::is_group_ignored(conn, &fp)
});
}
if groups.is_empty() {
println!("no image duplicates found");
return;
}
if config.review {
review::launch_review(&groups);
review::launch_review(&groups, &entries);
} else {
for (idx, group) in groups.iter().enumerate() {
let kind = match group.kind {

View File

@@ -1,12 +1,12 @@
use base64::Engine;
use deduper::{DuplicateGroup, DuplicateKind};
use deduper::{DuplicateGroup, DuplicateKind, ImageEntry, ignore_db};
use image::imageops::FilterType;
use std::fs;
use tiny_http::{Header, Method, Response, Server};
const THUMB_MAX: u32 = 300;
pub fn launch_review(groups: &[DuplicateGroup]) {
pub fn launch_review(groups: &[DuplicateGroup], entries: &[ImageEntry]) {
let port = find_open_port();
let addr = format!("127.0.0.1:{port}");
let server = Server::http(&addr).expect("failed to start review server");
@@ -15,6 +15,8 @@ pub fn launch_review(groups: &[DuplicateGroup]) {
let html = build_review_html(groups);
eprintln!("review server running at http://{addr}");
let db = ignore_db::open_db().ok();
let _ = open::that(format!("http://{addr}"));
loop {
@@ -34,6 +36,9 @@ pub fn launch_review(groups: &[DuplicateGroup]) {
(Method::Post, "/delete") => {
handle_delete(req);
}
(Method::Post, "/ignore") => {
handle_ignore(req, groups, entries, &db);
}
(Method::Post, "/shutdown") => {
let header = Header::from_bytes("Content-Type", "text/plain").unwrap();
let _ = req.respond(Response::from_string("bye").with_header(header));
@@ -81,6 +86,49 @@ fn handle_delete(mut req: tiny_http::Request) -> usize {
deleted
}
/// Handles `POST /ignore`: marks the group whose index is given in the request
/// body as "not a dupe" and replies with a small JSON status object.
///
/// The body is expected to be a JSON number indexing into `groups`. The
/// group's fingerprint is built from the SHA-256 values of the `entries` whose
/// path belongs to the group, then stored through `ignore_db::ignore_group`.
///
/// The reply is always `{"ok":<bool>,"message":"<text>"}`. `ok` is `false`
/// when no database is available, when the body cannot be read or parsed, when
/// the index is out of range, or when the insert fails.
fn handle_ignore(
    mut req: tiny_http::Request,
    groups: &[DuplicateGroup],
    entries: &[ImageEntry],
    db: &Option<rusqlite::Connection>,
) {
    let mut body = Vec::new();
    // A failed read can leave a truncated body ("12" cut down to "1") that would
    // still parse as a different, valid index. Reject the request instead of
    // risking a dismissal of the wrong group.
    let requested: Option<usize> = std::io::Read::read_to_end(req.as_reader(), &mut body)
        .ok()
        .and_then(|_| serde_json::from_slice(&body).ok());
    let target = requested.and_then(|idx| groups.get(idx).map(|group| (idx, group)));

    // The missing-database case is reported first, before the index is judged.
    let (ok, msg) = match (db.as_ref(), target) {
        (None, _) => (false, "no database"),
        (Some(_), None) => (false, "invalid group index"),
        (Some(conn), Some((group_idx, group))) => {
            let sha_list: Vec<&str> = entries
                .iter()
                .filter(|e| group.paths.contains(&e.path))
                .map(|e| e.sha256.as_str())
                .collect();
            let fp = ignore_db::group_fingerprint(&sha_list);
            match ignore_db::ignore_group(conn, &fp) {
                Ok(()) => {
                    eprintln!("ignored group {group_idx} (fingerprint: {fp})");
                    (true, "group ignored")
                }
                Err(e) => {
                    eprintln!("failed to ignore group: {e}");
                    (false, "db error")
                }
            }
        }
    };

    // `msg` is always one of the static strings above, so no JSON escaping is needed.
    let json = format!("{{\"ok\":{ok},\"message\":\"{msg}\"}}");
    let header = Header::from_bytes("Content-Type", "application/json").unwrap();
    let _ = req.respond(Response::from_string(json).with_header(header));
}
fn make_thumbnail_data_uri(path: &std::path::Path) -> String {
let data = match fs::read(path) {
Ok(d) => d,
@@ -117,7 +165,7 @@ body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; b
h1 { text-align: center; margin-bottom: 20px; color: #e94560; }
.summary { text-align: center; margin-bottom: 20px; color: #aaa; }
.group { background: #16213e; border-radius: 12px; padding: 20px; margin-bottom: 24px; border: 1px solid #0f3460; }
.group-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 16px; }
.group-header { display: flex; justify-content: space-between; align-items: center; margin-bottom: 16px; flex-wrap: wrap; gap: 8px; }
.group-title { font-size: 1.2em; font-weight: bold; }
.badge { padding: 4px 12px; border-radius: 20px; font-size: 0.85em; font-weight: bold; }
.badge-exact { background: #e94560; color: white; }
@@ -135,10 +183,13 @@ h1 { text-align: center; margin-bottom: 20px; color: #e94560; }
.btn-delete:hover { background: #c73650; }
.btn-done { background: #533483; color: white; }
.btn-done:hover { background: #3d2660; }
.btn-all { background: #0f3460; color: #eee; border: 1px solid #533483; }
.btn-all { background: #0f3460; color: #eee; border: 1px solid #533483; font-size: 0.85em; padding: 6px 14px; }
.btn-all:hover { background: #16213e; }
.btn-ignore { background: #2d6a4f; color: white; font-size: 0.85em; padding: 6px 14px; }
.btn-ignore:hover { background: #1b4332; }
.status { text-align: center; margin-top: 16px; font-size: 1.1em; color: #e94560; }
.deleted { opacity: 0.3; pointer-events: none; }
.ignored { opacity: 0.2; pointer-events: none; }
</style>
</head>
<body>
@@ -147,7 +198,7 @@ h1 { text-align: center; margin-bottom: 20px; color: #e94560; }
let total_files: usize = groups.iter().map(|g| g.paths.len()).sum();
html.push_str(&format!(
"<p class=\"summary\">{} groups, {} files — select files to delete</p>\n",
"<p class=\"summary\">{} groups, {} files — select files to delete or dismiss false positives</p>\n",
groups.len(),
total_files
));
@@ -168,6 +219,7 @@ h1 { text-align: center; margin-bottom: 20px; color: #e94560; }
<span class="group-title">Group {}</span>
<span class="badge {badge_class}">{kind_str}</span>
<button class="btn btn-all" onclick="selectAllBut('{idx}')">Keep first, select rest</button>
<button class="btn btn-ignore" onclick="ignoreGroup({idx})">👁️ Not a dupe</button>
</div>
<div class="images">
"#,
@@ -215,6 +267,26 @@ function selectAllBut(groupId) {
updateCount();
}
// Dismisses one duplicate group as a false positive.
// Sends groupIdx as a JSON number to POST /ignore and reads the JSON reply.
// On success the element with id 'group-' + groupIdx gets the 'ignored' class
// and the status line confirms the dismissal; otherwise the status line shows
// the error message from the reply or from the thrown exception.
async function ignoreGroup(groupIdx) {
try {
const res = await fetch('/ignore', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify(groupIdx)
});
const data = await res.json();
if (data.ok) {
const el = document.getElementById('group-' + groupIdx);
if (el) el.classList.add('ignored');
// groupIdx is zero-based; the status text shows it one-based
document.getElementById('status').textContent = 'Group ' + (groupIdx+1) + ' dismissed — won\'t appear next run';
} else {
document.getElementById('status').textContent = 'Error: ' + data.message;
}
} catch(e) {
// Reached when fetch rejects or the reply is not valid JSON
document.getElementById('status').textContent = 'Error: ' + e.message;
}
}
async function deleteSelected() {
const checks = document.querySelectorAll('.del-check:checked');
if (checks.length === 0) { alert('No files selected'); return; }