Add server-side deduplication on ingest

This commit is contained in:
Agent Zero
2026-03-24 05:40:30 +00:00
parent 5d5c042dd1
commit 61d6448b44
8 changed files with 421 additions and 23 deletions

View File

@@ -12,6 +12,7 @@ pub struct Config {
pub database: DatabaseConfig,
pub embedding: EmbeddingConfig,
pub query: QueryConfig,
pub dedup: DedupConfig,
pub ttl: TtlConfig,
pub auth: AuthConfig,
}
@@ -56,6 +57,13 @@ pub struct QueryConfig {
pub text_weight: f32,
}
/// Deduplication configuration.
///
/// Carries the single tunable exposed for deduplication, `threshold`.
#[derive(Debug, Clone, Deserialize)]
pub struct DedupConfig {
/// Deduplication threshold. When the field is absent from the deserialized
/// input, serde fills it in by calling `default_dedup_threshold`.
// NOTE(review): presumably a similarity cutoff at or above which an incoming
// item is treated as a duplicate — confirm against the ingest code path.
#[serde(default = "default_dedup_threshold")]
pub threshold: f32,
}
/// TTL / expiry configuration
#[derive(Debug, Clone, Deserialize)]
pub struct TtlConfig {
@@ -106,6 +114,7 @@ fn default_model_path() -> String { "models/all-MiniLM-L6-v2".to_string() }
/// Built-in default for the embedding dimension: `384`.
fn default_embedding_dim() -> usize {
    384_usize
}
/// Built-in default for `query.vector_weight`: `0.6`.
fn default_vector_weight() -> f32 {
    0.6_f32
}
/// Built-in default for `query.text_weight`: `0.4`.
fn default_text_weight() -> f32 {
    0.4_f32
}
/// Built-in default for `dedup.threshold`: `0.90`.
fn default_dedup_threshold() -> f32 {
    0.90_f32
}
/// Built-in default for `ttl.cleanup_interval_seconds`: `300`.
fn default_cleanup_interval_seconds() -> u64 {
    300_u64
}
/// Built-in default for the auth-enabled flag: `false`.
fn default_auth_enabled() -> bool {
    false
}
@@ -128,6 +137,8 @@ impl Config {
// Query settings
.set_default("query.vector_weight", default_vector_weight() as f64)?
.set_default("query.text_weight", default_text_weight() as f64)?
// Dedup settings
.set_default("dedup.threshold", default_dedup_threshold() as f64)?
// TTL settings
.set_default(
"ttl.cleanup_interval_seconds",
@@ -156,6 +167,11 @@ impl Config {
config.query.text_weight = parsed;
}
}
// Optional environment override for the dedup threshold.
//
// An unset, non-UTF-8, or unparseable `DEDUP_THRESHOLD` is ignored and the
// previously loaded value is kept. Non-finite values are ignored as well:
// `f32::from_str` accepts spellings such as "NaN" and "inf", and storing one
// of those would make every later comparison against the threshold
// meaningless.
if let Some(parsed) = std::env::var("DEDUP_THRESHOLD")
    .ok()
    .and_then(|raw| raw.parse::<f32>().ok())
    .filter(|value| value.is_finite())
{
    config.dedup.threshold = parsed;
}
Ok(config)
}
@@ -184,6 +200,9 @@ impl Default for Config {
vector_weight: default_vector_weight(),
text_weight: default_text_weight(),
},
dedup: DedupConfig {
threshold: default_dedup_threshold(),
},
ttl: TtlConfig {
cleanup_interval_seconds: default_cleanup_interval_seconds(),
},