Mirror of https://gitea.ingwaz.work/Ingwaz/openbrain-mcp.git (synced 2026-03-31 14:49:06 +00:00)
Add server-side deduplication on ingest
@@ -12,6 +12,7 @@ pub struct Config {
     pub database: DatabaseConfig,
     pub embedding: EmbeddingConfig,
     pub query: QueryConfig,
+    pub dedup: DedupConfig,
     pub ttl: TtlConfig,
     pub auth: AuthConfig,
 }
@@ -56,6 +57,13 @@ pub struct QueryConfig {
     pub text_weight: f32,
 }
 
+/// Deduplication configuration
+#[derive(Debug, Clone, Deserialize)]
+pub struct DedupConfig {
+    #[serde(default = "default_dedup_threshold")]
+    pub threshold: f32,
+}
+
 /// TTL / expiry configuration
 #[derive(Debug, Clone, Deserialize)]
 pub struct TtlConfig {
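Note: because `threshold` carries `#[serde(default = "default_dedup_threshold")]`, a config that omits the field still deserializes with the 0.90 default. A minimal sketch of that behavior, assuming a unit test placed in the same module (the test itself is not part of this diff):

```rust
#[cfg(test)]
mod dedup_config_tests {
    use super::DedupConfig;

    #[test]
    fn missing_threshold_falls_back_to_default() {
        // An empty object still deserializes: the serde attribute
        // fills in default_dedup_threshold() = 0.90.
        let cfg: DedupConfig = serde_json::from_str("{}").unwrap();
        assert!((cfg.threshold - 0.90).abs() < f32::EPSILON);
    }
}
```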
@@ -106,6 +114,7 @@ fn default_model_path() -> String { "models/all-MiniLM-L6-v2".to_string() }
 fn default_embedding_dim() -> usize { 384 }
 fn default_vector_weight() -> f32 { 0.6 }
 fn default_text_weight() -> f32 { 0.4 }
+fn default_dedup_threshold() -> f32 { 0.90 }
 fn default_cleanup_interval_seconds() -> u64 { 300 }
 fn default_auth_enabled() -> bool { false }
 
@@ -128,6 +137,8 @@ impl Config {
             // Query settings
             .set_default("query.vector_weight", default_vector_weight() as f64)?
             .set_default("query.text_weight", default_text_weight() as f64)?
+            // Dedup settings
+            .set_default("dedup.threshold", default_dedup_threshold() as f64)?
             // TTL settings
             .set_default(
                 "ttl.cleanup_interval_seconds",
@@ -156,6 +167,11 @@ impl Config {
                 config.query.text_weight = parsed;
             }
         }
+        if let Ok(dedup_threshold) = std::env::var("DEDUP_THRESHOLD") {
+            if let Ok(parsed) = dedup_threshold.parse::<f32>() {
+                config.dedup.threshold = parsed;
+            }
+        }
 
         Ok(config)
     }
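As with the other overrides in this block, `DEDUP_THRESHOLD` is ignored silently when it does not parse as an `f32`, so a typo degrades to the configured default rather than failing startup. A hedged usage sketch; `Config::load` here is a stand-in name for whatever constructor wraps this block, not an identifier from this diff:

```rust
// Runtime override, e.g. DEDUP_THRESHOLD=0.95 ./openbrain-mcp
std::env::set_var("DEDUP_THRESHOLD", "0.95");
let config = Config::load().unwrap(); // hypothetical constructor name
assert!((config.dedup.threshold - 0.95).abs() < f32::EPSILON);

// An unparsable value is dropped and the default survives.
std::env::set_var("DEDUP_THRESHOLD", "not-a-float");
let config = Config::load().unwrap();
assert!((config.dedup.threshold - 0.90).abs() < f32::EPSILON);
```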
@@ -184,6 +200,9 @@ impl Default for Config {
                 vector_weight: default_vector_weight(),
                 text_weight: default_text_weight(),
             },
+            dedup: DedupConfig {
+                threshold: default_dedup_threshold(),
+            },
             ttl: TtlConfig {
                 cleanup_interval_seconds: default_cleanup_interval_seconds(),
             },
src/db.rs (153 changed lines)
@@ -5,12 +5,12 @@
 use anyhow::{Context, Result};
 use deadpool_postgres::{Config, Pool, Runtime};
 use pgvector::Vector;
-use tokio_postgres::NoTls;
+use tokio_postgres::{GenericClient, NoTls};
 use tracing::info;
 use uuid::Uuid;
 
 use serde::Serialize;
-use serde_json::Value;
+use serde_json::{Map, Value};
 use crate::config::DatabaseConfig;
 
 /// Database wrapper with connection pool
@@ -42,6 +42,69 @@ pub struct MemoryMatch {
     pub hybrid_score: f32,
 }
 
+#[derive(Debug, Clone)]
+pub struct StoreMemoryResult {
+    pub id: Uuid,
+    pub deduplicated: bool,
+    pub expires_at: Option<chrono::DateTime<chrono::Utc>>,
+}
+
+#[derive(Debug, Clone)]
+struct DedupMatch {
+    id: Uuid,
+    metadata: Value,
+    expires_at: Option<chrono::DateTime<chrono::Utc>>,
+}
+
+fn merge_metadata(existing: &Value, incoming: &Value) -> Value {
+    match (existing, incoming) {
+        (Value::Object(existing), Value::Object(incoming)) => {
+            let mut merged = Map::with_capacity(existing.len() + incoming.len());
+            for (key, value) in existing {
+                merged.insert(key.clone(), value.clone());
+            }
+            for (key, value) in incoming {
+                merged.insert(key.clone(), value.clone());
+            }
+            Value::Object(merged)
+        }
+        (_, Value::Null) => existing.clone(),
+        _ => incoming.clone(),
+    }
+}
+
+async fn find_dedup_match<C>(
+    client: &C,
+    agent_id: &str,
+    embedding: &Vector,
+    threshold: f32,
+) -> Result<Option<DedupMatch>>
+where
+    C: GenericClient + Sync,
+{
+    let row = client
+        .query_opt(
+            r#"
+            SELECT id, metadata, expires_at
+            FROM memories
+            WHERE agent_id = $1
+              AND (expires_at IS NULL OR expires_at > NOW())
+              AND (1 - (embedding <=> $2)) >= $3
+            ORDER BY (1 - (embedding <=> $2)) DESC, created_at DESC
+            LIMIT 1
+            "#,
+            &[&agent_id, embedding, &threshold],
+        )
+        .await
+        .context("Failed to check for duplicate memory")?;
+
+    Ok(row.map(|row| DedupMatch {
+        id: row.get("id"),
+        metadata: row.get("metadata"),
+        expires_at: row.get("expires_at"),
+    }))
+}
+
 impl Database {
     /// Create a new database connection pool
     pub async fn new(config: &DatabaseConfig) -> Result<Self> {
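`merge_metadata` is a shallow merge: top-level keys from the incoming object win, everything else in the existing object survives, and a `Null` incoming value leaves the existing metadata untouched. A sketch of a unit test pinning that down; the test module is an illustration, not part of this commit:

```rust
#[cfg(test)]
mod merge_metadata_tests {
    use super::merge_metadata;
    use serde_json::{json, Value};

    #[test]
    fn incoming_keys_win_shallowly() {
        let existing = json!({"source": "chat", "tags": ["a"]});
        let incoming = json!({"tags": ["b"], "priority": 1});
        // Nested values are replaced wholesale, not deep-merged:
        // "tags" becomes ["b"], not ["a", "b"].
        assert_eq!(
            merge_metadata(&existing, &incoming),
            json!({"source": "chat", "tags": ["b"], "priority": 1})
        );
    }

    #[test]
    fn null_incoming_preserves_existing() {
        let existing = json!({"source": "chat"});
        assert_eq!(merge_metadata(&existing, &Value::Null), existing);
    }
}
```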
@@ -77,11 +140,38 @@ impl Database {
         keywords: &[String],
         metadata: serde_json::Value,
         expires_at: Option<chrono::DateTime<chrono::Utc>>,
-    ) -> Result<Uuid> {
+        dedup_threshold: f32,
+    ) -> Result<StoreMemoryResult> {
         let client = self.pool.get().await?;
-        let id = Uuid::new_v4();
         let vector = Vector::from(embedding.to_vec());
 
+        if let Some(existing) = find_dedup_match(&client, agent_id, &vector, dedup_threshold).await? {
+            let merged_metadata = merge_metadata(&existing.metadata, &metadata);
+            let refreshed_expires_at = expires_at.or(existing.expires_at);
+
+            client
+                .execute(
+                    r#"
+                    UPDATE memories
+                    SET metadata = $2,
+                        created_at = NOW(),
+                        expires_at = $3
+                    WHERE id = $1
+                    "#,
+                    &[&existing.id, &merged_metadata, &refreshed_expires_at],
+                )
+                .await
+                .context("Failed to update deduplicated memory")?;
+
+            return Ok(StoreMemoryResult {
+                id: existing.id,
+                deduplicated: true,
+                expires_at: refreshed_expires_at,
+            });
+        }
+
+        let id = Uuid::new_v4();
+
         client
             .execute(
                 r#"
@@ -93,7 +183,11 @@ impl Database {
             .await
             .context("Failed to store memory")?;
 
-        Ok(id)
+        Ok(StoreMemoryResult {
+            id,
+            deduplicated: false,
+            expires_at,
+        })
     }
 
     /// Query memories by vector similarity
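For reference, the predicate `(1 - (embedding <=> $2)) >= $3` in `find_dedup_match` compares cosine similarity against the configured threshold, assuming `<=>` is pgvector's cosine-distance operator. A plain-Rust sketch of the same score, for intuition only:

```rust
/// Cosine similarity, the quantity computed server-side as
/// `1 - (a <=> b)`; 1.0 means the vectors point the same way.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let norm_a = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    dot / (norm_a * norm_b)
}

// With the 0.90 default, two embeddings whose similarity is >= 0.90
// collapse into one memory row instead of inserting a duplicate.
```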
@@ -257,6 +351,7 @@ impl Database {
 pub struct BatchStoreResult {
     pub id: String,
     pub status: String,
+    pub deduplicated: bool,
     pub expires_at: Option<String>,
 }
 
@@ -272,23 +367,51 @@ impl Database {
             Vec<String>,
             Option<chrono::DateTime<chrono::Utc>>,
         )>,
+        dedup_threshold: f32,
     ) -> Result<Vec<BatchStoreResult>> {
         let mut client = self.pool.get().await?;
         let transaction = client.transaction().await?;
         let mut results = Vec::with_capacity(entries.len());
 
         for (content, metadata, embedding, keywords, expires_at) in entries {
-            let id = Uuid::new_v4();
             let vector = Vector::from(embedding);
-            transaction.execute(
-                r#"INSERT INTO memories (id, agent_id, content, embedding, keywords, metadata, expires_at) VALUES ($1, $2, $3, $4, $5, $6, $7)"#,
-                &[&id, &agent_id, &content, &vector, &keywords, &metadata, &expires_at],
-            ).await?;
-            results.push(BatchStoreResult {
-                id: id.to_string(),
-                status: "stored".to_string(),
-                expires_at: expires_at.map(|ts| ts.to_rfc3339()),
-            });
+            if let Some(existing) =
+                find_dedup_match(&transaction, agent_id, &vector, dedup_threshold).await?
+            {
+                let merged_metadata = merge_metadata(&existing.metadata, &metadata);
+                let refreshed_expires_at = expires_at.or(existing.expires_at);
+                transaction
+                    .execute(
+                        r#"
+                        UPDATE memories
+                        SET metadata = $2,
+                            created_at = NOW(),
+                            expires_at = $3
+                        WHERE id = $1
+                        "#,
+                        &[&existing.id, &merged_metadata, &refreshed_expires_at],
+                    )
+                    .await
+                    .context("Failed to update deduplicated batch memory")?;
+                results.push(BatchStoreResult {
+                    id: existing.id.to_string(),
+                    status: "deduplicated".to_string(),
+                    deduplicated: true,
+                    expires_at: refreshed_expires_at.map(|ts| ts.to_rfc3339()),
+                });
+            } else {
+                let id = Uuid::new_v4();
+                transaction.execute(
+                    r#"INSERT INTO memories (id, agent_id, content, embedding, keywords, metadata, expires_at) VALUES ($1, $2, $3, $4, $5, $6, $7)"#,
+                    &[&id, &agent_id, &content, &vector, &keywords, &metadata, &expires_at],
+                ).await?;
+                results.push(BatchStoreResult {
+                    id: id.to_string(),
+                    status: "stored".to_string(),
+                    deduplicated: false,
+                    expires_at: expires_at.map(|ts| ts.to_rfc3339()),
+                });
+            }
         }
         transaction.commit().await?;
         Ok(results)
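Callers can distinguish the two outcomes either by `status` or by the new `deduplicated` flag. A hedged consumer-side sketch (the loop and variable names are illustrative, not from this diff):

```rust
// `results` is the Vec<BatchStoreResult> returned above.
for r in &results {
    if r.deduplicated {
        // `r.id` is the surviving row's UUID, not a fresh one, and
        // `r.expires_at` reflects the refreshed TTL if any.
        println!("merged into existing memory {}", r.id);
    } else {
        println!("stored new memory {}", r.id);
    }
}
```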
@@ -103,7 +103,7 @@ pub async fn execute(state: &Arc<AppState>, arguments: Value) -> Result<String>
     // 5. Batch DB insert (single transaction for atomicity)
     let results = state
         .db
-        .batch_store_memories(agent_id, processed_entries)
+        .batch_store_memories(agent_id, processed_entries, state.config.dedup.threshold)
         .await
         .context("Failed to batch store memories")?;
 
@@ -15,7 +15,7 @@ pub fn get_tool_definitions() -> Vec<Value> {
     vec![
         json!({
             "name": "store",
-            "description": "Store a memory with automatic embedding generation and keyword extraction. The memory will be associated with the agent_id for isolated retrieval.",
+            "description": "Store a memory with automatic embedding generation and keyword extraction. Near-duplicate memories for the same agent are deduplicated automatically by similarity, with metadata merged and timestamps refreshed.",
             "inputSchema": {
                 "type": "object",
                 "properties": {
@@ -41,7 +41,7 @@ pub fn get_tool_definitions() -> Vec<Value> {
         }),
         json!({
             "name": "batch_store",
-            "description": "Store multiple memories with automatic embedding generation and keyword extraction. Accepts 1-50 entries and stores them atomically in a single transaction.",
+            "description": "Store multiple memories with automatic embedding generation and keyword extraction. Accepts 1-50 entries, stores them atomically in a single transaction, and applies the same automatic deduplication rules as single-store.",
             "inputSchema": {
                 "type": "object",
                 "properties": {
@@ -60,20 +60,26 @@ pub async fn execute(state: &Arc<AppState>, arguments: Value) -> Result<String>
             &keywords,
             metadata,
             expires_at.clone(),
+            state.config.dedup.threshold,
         )
         .await
         .context("Failed to store memory")?;
 
-    info!("Memory stored with ID: {}", id);
+    info!(
+        "Memory {} with ID: {}",
+        if id.deduplicated { "deduplicated" } else { "stored" },
+        id.id
+    );
 
     Ok(serde_json::json!({
         "success": true,
-        "id": id.to_string(),
+        "id": id.id.to_string(),
         "agent_id": agent_id,
+        "deduplicated": id.deduplicated,
         "keywords": keywords,
         "embedding_dimension": embedding.len(),
         "ttl": ttl,
-        "expires_at": expires_at.as_ref().map(|ts| ts.to_rfc3339())
+        "expires_at": id.expires_at.as_ref().map(|ts| ts.to_rfc3339())
     })
     .to_string())
 }
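Clients of the `store` tool now see a `deduplicated` field, and `expires_at` reflects the surviving row rather than the request. A partial client-side mirror of the `json!` block above, as a sketch (this struct is not part of the diff; the `ttl` field is omitted since its type is not visible here, and serde ignores unknown fields by default):

```rust
use serde::Deserialize;

// Hypothetical client-side view of the store tool's response.
#[derive(Debug, Deserialize)]
struct StoreResponse {
    success: bool,
    id: String,
    agent_id: String,
    deduplicated: bool, // true when the memory merged into an existing row
    keywords: Vec<String>,
    embedding_dimension: usize,
    expires_at: Option<String>, // RFC 3339, refreshed on dedup
}
```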