From e1f8201b5cff0acba4ee8bc508f61396dfb87068 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 27 Apr 2026 23:33:27 +0000 Subject: [PATCH] feat: complete image phase - SHA-256 exact + dHash perceptual duplicate detection - lib.rs: scan_images, compute_dhash, hamming, find_duplicate_groups - main.rs: CLI with folder arg and optional hamming threshold - 13 unit tests: hamming, is_image_path, dhash, find_duplicate_groups - 7 integration tests: real files, empty dir, cropped, non-image exclusion, subdirectory recursion, single file, CLI binary output - All 20 tests passing --- .gitignore | 1 + Cargo.lock | 1101 ++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 4 + src/lib.rs | 286 +++++++++++ src/main.rs | 41 +- tests/image_phase.rs | 124 +++++ 6 files changed, 1556 insertions(+), 1 deletion(-) create mode 100644 Cargo.lock create mode 100644 src/lib.rs create mode 100644 tests/image_phase.rs diff --git a/.gitignore b/.gitignore index ea8c4bf..bc6e7ea 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +.ssh/ diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..c431661 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1101 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "aligned" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee4508988c62edf04abd8d92897fca0c2995d907ce1dfeaf369dac3716a40685" +dependencies = [ + "as-slice", +] + +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" + +[[package]] +name = "arg_enum_proc_macro" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "as-slice" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "516b6b4f0e40d50dcda9365d53964ec74560ad4284da2e7fc97122cd83174516" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "av-scenechange" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f321d77c20e19b92c39e7471cf986812cbb46659d2af674adc4331ef3f18394" +dependencies = [ + "aligned", + "anyhow", + "arg_enum_proc_macro", + "arrayvec", + "log", + "num-rational", + "num-traits", + "pastey", + "rayon", + "thiserror", + "v_frame", + "y4m", +] + +[[package]] +name = "av1-grain" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cfddb07216410377231960af4fcab838eaa12e013417781b78bd95ee22077f8" +dependencies = [ + "anyhow", + "arrayvec", + "log", + "nom", + "num-rational", + "v_frame", +] + +[[package]] +name = "avif-serialize" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "375082f007bd67184fb9c0374614b29f9aaa604ec301635f72338bb65386a53d" +dependencies = [ + "arrayvec", +] + +[[package]] +name = "bit_field" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e4b40c7323adcfc0a41c4b88143ed58346ff65a288fc144329c5c45e05d70c6" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "bitstream-io" +version = "4.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eff00be299a18769011411c9def0d827e8f2d7bf0c3dbf53633147a8867fd1f" +dependencies = [ + "no_std_io2", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "built" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4ad8f11f288f48ca24471bbd51ac257aaeaaa07adae295591266b792902ae64" + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + +[[package]] +name = "cc" +version = "1.2.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "deduper" +version = "0.1.0" +dependencies = [ + "anyhow", + "image", + "sha2", + "walkdir", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "exr" +version = "1.74.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4300e043a56aa2cb633c01af81ca8f699a321879a7854d3896a0ba89056363be" +dependencies = [ + "bit_field", + "half", + "lebe", + "miniz_oxide", + "rayon-core", + "smallvec", + "zune-inflate", +] + +[[package]] +name = "fax" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf1079563223d5d59d83c85886a56e586cfd5c1a26292e971a0fa266531ac5a" + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "gif" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8cfcc411d9adbbaba82fb72661cc1bcca13e8bba98b364e62b2dba8f960159" +dependencies = [ + "color_quant", + "weezl", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + +[[package]] +name = "image" +version = "0.25.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85ab80394333c02fe689eaf900ab500fbd0c2213da414687ebf995a65d5a6104" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "exr", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "qoi", + "ravif", + "rayon", + "rgb", + "tiff", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "imgref" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c5cedc30da3a610cac6b4ba17597bdf7152cf974e8aab3afb3d54455e371c8" + +[[package]] +name = "interpolate_name" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom", + "libc", +] + +[[package]] +name = "lebe" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12a681b7dd8ce12bff52488013ba614b869148d54dd79836ab85aafdd53f08d" +dependencies = [ + "arbitrary", + "cc", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "loop9" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fae87c125b03c1d2c0150c90365d7d6bcc53fb73a9acaef207d2d065860f062" +dependencies = [ + "imgref", +] + +[[package]] +name = "maybe-rayon" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" +dependencies = [ + "cfg-if", + "rayon", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "moxcms" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb85c154ba489f01b25c0d36ae69a87e4a1c73a72631fc6c0eb6dde34a73e44b" +dependencies = [ + "num-traits", + "pxfm", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "no_std_io2" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b51ed7824b6e07d354605f4abb3d9d300350701299da96642ee084f5ce631550" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "noop_proc_macro" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "profiling" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3eb8486b569e12e2c32ad3e204dbaba5e4b5b216e9367044f25f1dba42341773" +dependencies = [ + "profiling-procmacros", +] + +[[package]] +name = "profiling-procmacros" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52717f9a02b6965224f95ca2a81e2e0c5c43baacd28ca057577988930b6c3d5b" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "pxfm" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0c5ccf5294c6ccd63a74f1565028353830a9c2f5eb0c682c355c471726a6e3f" + +[[package]] +name = "qoi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6d64c71eb498fe9eae14ce4ec935c555749aef511cca85b5568910d6e48001" +dependencies = [ + "bytemuck", +] + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rav1e" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b6dd56e85d9483277cde964fd1bdb0428de4fec5ebba7540995639a21cb32b" +dependencies = [ + "aligned-vec", + "arbitrary", + "arg_enum_proc_macro", + "arrayvec", + "av-scenechange", + "av1-grain", + "bitstream-io", + "built", + "cfg-if", + "interpolate_name", + "itertools", + "libc", + "libfuzzer-sys", + "log", + "maybe-rayon", + "new_debug_unreachable", + "noop_proc_macro", + "num-derive", + "num-traits", + "paste", + "profiling", + "rand", + "rand_chacha", + "simd_helpers", + "thiserror", + "v_frame", + "wasm-bindgen", +] + +[[package]] +name = "ravif" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e52310197d971b0f5be7fe6b57530dcd27beb35c1b013f29d66c1ad73fbbcc45" +dependencies = [ + "avif-serialize", + "imgref", + "loop9", + "quick-error", + "rav1e", + "rayon", + "rgb", +] + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "rgb" +version = "0.8.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b34b781b31e5d73e9fbc8689c70551fd1ade9a19e3e28cfec8580a79290cc4" + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "simd_helpers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" +dependencies = [ + "quote", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tiff" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b63feaf3343d35b6ca4d50483f94843803b0f51634937cc2ec519fc32232bc52" +dependencies = [ + "fax", + "flate2", + "half", + "quick-error", + "weezl", + "zune-jpeg", +] + +[[package]] +name = "typenum" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "v_frame" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "666b7727c8875d6ab5db9533418d7c764233ac9c0cff1d469aec8fa127597be2" +dependencies = [ + "aligned-vec", + "num-traits", + "wasm-bindgen", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasip2" +version = "1.0.3+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf938a0bacb0469e83c1e148908bd7d5a6010354cf4fb73279b7447422e3a89" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeff24f84126c0ec2db7a449f0c2ec963c6a49efe0698c4242929da037ca28ed" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d08065faf983b2b80a79fd87d8254c409281cf7de75fc4b773019824196c904" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd04d9e306f1907bd13c6361b5c6bfc7b3b3c095ed3f8a9246390f8dbdee129" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "y4m" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-inflate" +version = "0.2.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ab332fe2f6680068f3582b16a24f90ad7096d5d39b974d1c0aff0125116f02" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "zune-jpeg" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27bc9d5b815bc103f142aa054f561d9187d191692ec7c2d1e2b4737f8dbd7296" +dependencies = [ + "zune-core", +] diff --git a/Cargo.toml b/Cargo.toml index 896ad10..f61c10c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,3 +4,7 @@ version = "0.1.0" edition = "2024" [dependencies] +image = { version = "0.25", default-features = true, features = ["jpeg", "png", "gif", "webp", "bmp", "tiff"] } +sha2 = "0.10" +walkdir = "2.5" +anyhow = "1" diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..28bd9ce --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,286 @@ +use anyhow::Result; +use image::imageops::FilterType; +use sha2::{Digest, Sha256}; +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::path::{Path, PathBuf}; +use walkdir::WalkDir; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ImageEntry { + pub path: PathBuf, + pub sha256: String, + pub dhash: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DuplicateGroup { + pub kind: DuplicateKind, + pub paths: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DuplicateKind { + Exact, + Similar, +} + +fn is_image_path(path: &Path) -> bool { + path.extension() + .and_then(|e| e.to_str()) + .map(|e| matches!(e.to_ascii_lowercase().as_str(), "jpg" | "jpeg" | "png" | "webp" | "bmp" | "gif" | "tif" | "tiff")) + .unwrap_or(false) +} + +pub fn scan_images(root: &Path) -> Result> { + let mut out = Vec::new(); + for entry in WalkDir::new(root).follow_links(true) { + let entry = entry?; + let path = entry.path(); + if !entry.file_type().is_file() || !is_image_path(path) { + continue; + } + let bytes = fs::read(path)?; + let sha256 = format!("{:x}", Sha256::digest(&bytes)); + let img = image::open(path)?; + let dhash = compute_dhash(&img); + out.push(ImageEntry { + path: path.to_path_buf(), + sha256, + dhash, + }); + } + Ok(out) +} + +pub fn compute_dhash(img: &image::DynamicImage) -> u64 { + let gray = img + .grayscale() + .resize_exact(9, 8, FilterType::Triangle) + .to_luma8(); + let mut hash = 0u64; + let mut bit = 0; + for y in 0..8 { + for x in 0..8 { + let left = gray.get_pixel(x, y)[0]; + let right = gray.get_pixel(x + 1, y)[0]; + if left > right { + hash |= 1 << bit; + } + bit += 1; + } + } + hash +} + +pub fn hamming(a: u64, b: u64) -> u32 { + (a ^ b).count_ones() +} + +pub fn find_duplicate_groups(entries: &[ImageEntry], hamming_threshold: u32) -> Vec { + let mut groups = Vec::new(); + + let mut exact: HashMap<&str, Vec> = HashMap::new(); + for e in entries { + exact.entry(&e.sha256).or_default().push(e.path.clone()); + } + for paths in exact.into_values() { + if paths.len() > 1 { + groups.push(DuplicateGroup { + kind: DuplicateKind::Exact, + paths, + }); + } + } + + let n = entries.len(); + let mut parent: Vec = (0..n).collect(); + + fn find(parent: &mut [usize], x: usize) -> usize { + if parent[x] != x { + let p = parent[x]; + parent[x] = find(parent, p); + } + parent[x] + } + + fn union(parent: &mut [usize], a: usize, b: usize) { + let ra = find(parent, a); + let rb = find(parent, b); + if ra != rb { + parent[rb] = ra; + } + } + + for i in 0..n { + for j in (i + 1)..n { + if entries[i].sha256 == entries[j].sha256 { + continue; + } + if hamming(entries[i].dhash, entries[j].dhash) <= hamming_threshold { + union(&mut parent, i, j); + } + } + } + + let mut similar: HashMap> = HashMap::new(); + for (idx, e) in entries.iter().enumerate() { + let root = find(&mut parent, idx); + similar.entry(root).or_default().push(e.path.clone()); + } + + let exact_paths: HashSet = groups + .iter() + .flat_map(|g| g.paths.iter().cloned()) + .collect(); + + for paths in similar.into_values() { + if paths.len() > 1 { + let non_exact = paths.iter().filter(|p| !exact_paths.contains(*p)).count(); + if non_exact >= 2 { + groups.push(DuplicateGroup { + kind: DuplicateKind::Similar, + paths, + }); + } + } + } + + groups +} + +#[cfg(test)] +mod tests { + use super::*; + use std::path::PathBuf; + + #[test] + fn hamming_identical() { + assert_eq!(hamming(0, 0), 0); + assert_eq!(hamming(u64::MAX, u64::MAX), 0); + } + + #[test] + fn hamming_opposite() { + assert_eq!(hamming(0, u64::MAX), 64); + } + + #[test] + fn hamming_single_bit() { + assert_eq!(hamming(0b0000, 0b0001), 1); + assert_eq!(hamming(0b1010, 0b1000), 1); + } + + #[test] + fn is_image_path_accepts_valid_extensions() { + for ext in &["jpg", "jpeg", "png", "webp", "bmp", "gif", "tif", "tiff"] { + let p = PathBuf::from(format!("photo.{ext}")); + assert!(is_image_path(&p), "should accept .{ext}"); + } + } + + #[test] + fn is_image_path_case_insensitive() { + assert!(is_image_path(Path::new("photo.JPG"))); + assert!(is_image_path(Path::new("photo.Png"))); + } + + #[test] + fn is_image_path_rejects_non_image() { + assert!(!is_image_path(Path::new("file.txt"))); + assert!(!is_image_path(Path::new("file.mp3"))); + assert!(!is_image_path(Path::new("file.pdf"))); + assert!(!is_image_path(Path::new("noext"))); + } + + #[test] + fn dhash_deterministic() { + let img = image::DynamicImage::new_rgb8(100, 100); + let h1 = compute_dhash(&img); + let h2 = compute_dhash(&img); + assert_eq!(h1, h2); + } + + #[test] + fn dhash_solid_images_differ_from_gradient() { + // Solid black: all pixels 0 -> dhash = 0 (no left > right) + let black = image::DynamicImage::new_rgb8(64, 64); + // Gradient: right-to-left (bright left, dark right) -> left > right = true + let mut grad = image::RgbImage::new(64, 64); + for y in 0..64 { + for x in 0..64 { + let v = (255 - x * 255 / 63) as u8; + grad.put_pixel(x, y, image::Rgb([v, v, v])); + } + } + let grad = image::DynamicImage::ImageRgb8(grad); + let h_black = compute_dhash(&black); + let h_grad = compute_dhash(&grad); + assert_ne!(h_black, h_grad, "solid vs gradient hashes must differ"); + assert!(hamming(h_black, h_grad) > 8, "solid vs gradient should differ significantly"); + } + + fn make_entry(path: &str, sha: &str, dhash: u64) -> ImageEntry { + ImageEntry { + path: PathBuf::from(path), + sha256: sha.to_string(), + dhash, + } + } + + #[test] + fn find_groups_empty_input() { + let groups = find_duplicate_groups(&[], 8); + assert!(groups.is_empty()); + } + + #[test] + fn find_groups_no_duplicates() { + let entries = vec![ + make_entry("a.jpg", "aaa", 0), + make_entry("b.jpg", "bbb", u64::MAX), + ]; + let groups = find_duplicate_groups(&entries, 8); + assert!(groups.is_empty(), "different hash+dhash = no groups"); + } + + #[test] + fn find_groups_exact_only() { + let entries = vec![ + make_entry("a.jpg", "same", 100), + make_entry("b.jpg", "same", 100), + make_entry("c.jpg", "diff", 999), + ]; + let groups = find_duplicate_groups(&entries, 8); + assert_eq!(groups.iter().filter(|g| g.kind == DuplicateKind::Exact).count(), 1); + let exact = groups.iter().find(|g| g.kind == DuplicateKind::Exact).unwrap(); + assert_eq!(exact.paths.len(), 2); + } + + #[test] + fn find_groups_similar_only() { + let entries = vec![ + make_entry("a.jpg", "aaa", 0b0000_0000), + make_entry("b.jpg", "bbb", 0b0000_0011), // hamming=2 + make_entry("c.jpg", "ccc", u64::MAX), // far away + ]; + let groups = find_duplicate_groups(&entries, 8); + assert!(groups.iter().any(|g| g.kind == DuplicateKind::Similar), "should find similar pair"); + assert!(!groups.iter().any(|g| g.paths.iter().any(|p| p == Path::new("c.jpg")) + && g.paths.iter().any(|p| p == Path::new("a.jpg"))), "c.jpg should not group with a.jpg"); + } + + #[test] + fn find_groups_threshold_boundary() { + let entries = vec![ + make_entry("a.jpg", "aaa", 0), + make_entry("b.jpg", "bbb", 0b1111_1111), // hamming=8 + ]; + // threshold=8: should match + let groups8 = find_duplicate_groups(&entries, 8); + assert!(groups8.iter().any(|g| g.kind == DuplicateKind::Similar)); + // threshold=7: should NOT match + let groups7 = find_duplicate_groups(&entries, 7); + assert!(!groups7.iter().any(|g| g.kind == DuplicateKind::Similar)); + } +} diff --git a/src/main.rs b/src/main.rs index e7a11a9..53b9d31 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,42 @@ +use deduper::{find_duplicate_groups, scan_images, DuplicateKind}; +use std::env; +use std::path::Path; + fn main() { - println!("Hello, world!"); + let args: Vec = env::args().collect(); + if args.len() < 2 { + eprintln!("usage: deduper [hamming-threshold]"); + std::process::exit(1); + } + + let root = Path::new(&args[1]); + let threshold = args + .get(2) + .and_then(|s| s.parse::().ok()) + .unwrap_or(8); + + let entries = match scan_images(root) { + Ok(v) => v, + Err(e) => { + eprintln!("scan error: {e}"); + std::process::exit(1); + } + }; + + let groups = find_duplicate_groups(&entries, threshold); + if groups.is_empty() { + println!("no image duplicates found"); + return; + } + + for (idx, group) in groups.iter().enumerate() { + let kind = match group.kind { + DuplicateKind::Exact => "exact", + DuplicateKind::Similar => "similar", + }; + println!("group {} [{}]", idx + 1, kind); + for path in &group.paths { + println!(" {}", path.display()); + } + } } diff --git a/tests/image_phase.rs b/tests/image_phase.rs new file mode 100644 index 0000000..2b3d0e3 --- /dev/null +++ b/tests/image_phase.rs @@ -0,0 +1,124 @@ +use deduper::{find_duplicate_groups, hamming, scan_images, DuplicateKind}; +use std::path::Path; + +fn fixture(name: &str) -> String { + format!("/a0/usr/projects/deduper/.a0proj/test_media/images/{name}") +} + +#[test] +fn image_phase_real_files_red_green() { + let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images"); + let entries = scan_images(dir).expect("scan images"); + assert!(entries.len() >= 5, "need fixtures"); + + let orig = entries + .iter() + .find(|e| e.path == Path::new(&fixture("orig.jpg"))) + .unwrap(); + let copy = entries + .iter() + .find(|e| e.path == Path::new(&fixture("orig_copy.jpg"))) + .unwrap(); + let resized = entries + .iter() + .find(|e| e.path == Path::new(&fixture("orig_resized.jpg"))) + .unwrap(); + let blue = entries + .iter() + .find(|e| e.path == Path::new(&fixture("solid_blue.jpg"))) + .unwrap(); + + assert_eq!(orig.sha256, copy.sha256); + assert!(hamming(orig.dhash, resized.dhash) <= 8, "resized should be similar"); + assert!(hamming(orig.dhash, blue.dhash) > 8, "blue should be unrelated"); + + let groups = find_duplicate_groups(&entries, 8); + assert!(groups.iter().any(|g| { + g.kind == DuplicateKind::Exact + && g.paths.iter().any(|p| p.ends_with("orig.jpg")) + && g.paths.iter().any(|p| p.ends_with("orig_copy.jpg")) + }), "missing exact group"); + assert!(groups.iter().any(|g| { + g.kind == DuplicateKind::Similar + && g.paths.iter().any(|p| p.ends_with("orig.jpg")) + && g.paths.iter().any(|p| p.ends_with("orig_resized.jpg")) + }), "missing similar group"); + assert!(!groups.iter().any(|g| { + g.paths.iter().any(|p| p.ends_with("solid_blue.jpg")) + && g.paths.iter().any(|p| p.ends_with("orig.jpg")) + }), "false positive with unrelated image"); +} + +#[test] +fn scan_empty_dir_returns_no_entries() { + let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images/empty_dir"); + let entries = scan_images(dir).expect("scan empty dir"); + assert!(entries.is_empty(), "empty dir should yield no entries"); +} + +#[test] +fn cropped_image_similar_to_original() { + let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images"); + let entries = scan_images(dir).expect("scan"); + let orig = entries.iter().find(|e| e.path.ends_with("orig.jpg")).expect("orig.jpg"); + let cropped = entries.iter().find(|e| e.path.ends_with("orig_cropped.jpg")).expect("orig_cropped.jpg"); + assert_ne!(orig.sha256, cropped.sha256, "cropped should differ in bytes"); + assert!(hamming(orig.dhash, cropped.dhash) <= 12, "cropped should be perceptually similar (hamming <= 12)"); +} + +#[test] +fn non_image_files_excluded_from_scan() { + let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images"); + let entries = scan_images(dir).expect("scan"); + // readme.txt and data.csv exist in dir but must not appear in results + assert!(!entries.iter().any(|e| e.path.ends_with("readme.txt")), "txt file should be excluded"); + assert!(!entries.iter().any(|e| e.path.ends_with("data.csv")), "csv file should be excluded"); + // all entries should have image extensions + for e in &entries { + let ext = e.path.extension().unwrap().to_str().unwrap().to_ascii_lowercase(); + assert!(matches!(ext.as_str(), "jpg"|"jpeg"|"png"|"webp"|"bmp"|"gif"|"tif"|"tiff"), + "unexpected ext: {ext} in {}", e.path.display()); + } +} + +#[test] +fn scan_recurses_into_subdirectories() { + let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images"); + let entries = scan_images(dir).expect("scan"); + // gradient_sub.jpg lives in subdir/ - must be found + let sub = entries.iter().find(|e| e.path.ends_with("subdir/gradient_sub.jpg")); + assert!(sub.is_some(), "should find image in subdirectory"); + // it's an exact copy of gradient.jpg + let grad = entries.iter().find(|e| e.path.ends_with("gradient.jpg") && !e.path.to_str().unwrap().contains("subdir")).unwrap(); + let sub = sub.unwrap(); + assert_eq!(grad.sha256, sub.sha256, "exact copy should have same sha256"); + let groups = find_duplicate_groups(&entries, 8); + assert!(groups.iter().any(|g| { + g.kind == DuplicateKind::Exact + && g.paths.iter().any(|p| p.ends_with("gradient.jpg") && !p.to_str().unwrap().contains("subdir")) + && g.paths.iter().any(|p| p.ends_with("gradient_sub.jpg")) + }), "should group gradient.jpg and subdir/gradient_sub.jpg as exact"); +} + +#[test] +fn single_image_no_duplicates() { + let dir = Path::new("/a0/usr/projects/deduper/.a0proj/test_media/images/single"); + let entries = scan_images(dir).expect("scan"); + assert_eq!(entries.len(), 1, "should find exactly one image"); + let groups = find_duplicate_groups(&entries, 8); + assert!(groups.is_empty(), "single image should produce no duplicate groups"); +} + +#[test] +fn cli_binary_reports_duplicates() { + let bin = env!("CARGO_BIN_EXE_deduper"); + let output = std::process::Command::new(bin) + .arg("/a0/usr/projects/deduper/.a0proj/test_media/images") + .arg("8") + .output() + .expect("failed to run deduper binary"); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(output.status.success(), "binary should exit 0"); + assert!(stdout.contains("[exact]"), "output should contain exact groups: {stdout}"); + assert!(stdout.contains("[similar]"), "output should contain similar groups: {stdout}"); +}