meeting_memory/meeting_processor.py

175 lines
6.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import hashlib
import logging
import os
from typing import Optional
from config import config
from extractor import MeetingExtraction, extract_meeting_info
from graph_store import graph_store
from meeting_state import MeetingStateStore
from raw_store import raw_meeting_store
from vector_store import meeting_vector_store
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# State store built at import time from config.state_path; MeetingProcessor
# uses it for content-hash bookkeeping and for merging action items/metrics.
state_store = MeetingStateStore(config.state_path)
class MeetingProcessor:
    """Ingest meeting transcripts and answer questions over what was stored.

    Ingestion computes a content hash, checks for duplicates (by hash, by
    vector similarity, and by title/date), extracts structured data from the
    text, and writes the result to the raw store, the state store, the vector
    store and the graph store. Duplicate resolution is interactive: the user
    is asked on stdin whether to skip or overwrite.
    """

    # Prompt shown when a similar or duplicate meeting is found.
    _OVERWRITE_PROMPT = "选择 [s]跳过 / [o]覆盖(默认 s): "
    # Similarity score passed to the vector store's find_similar_text().
    _SIMILARITY_THRESHOLD = 0.92

    def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]:
        """Read a UTF-8 text file and process its content as a meeting.

        Args:
            filepath: Path of the file to read.
            force: Forwarded to ``process_meeting_text``.

        Returns:
            Whatever ``process_meeting_text`` returns for the file content.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        # The file is closed before the (potentially slow) processing starts.
        return self.process_meeting_text(text, force=force)

    def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]:
        """Deduplicate, extract and store one meeting transcript.

        Args:
            text: Raw meeting text.
            force: When true, skip the content-hash and similarity checks and
                overwrite an existing meeting with the same title/date
                without prompting.

        Returns:
            The value returned by ``raw_meeting_store.save`` for the stored
            text, or ``None`` when the meeting was skipped or extraction
            failed.
        """
        content_hash = self._compute_content_hash(text)

        if not force and state_store.has_content_hash(content_hash):
            print("\n检测到重复内容,已跳过。")
            logger.info("Duplicate content hash skipped: %s", content_hash[:12])
            return None

        if not force:
            similar = meeting_vector_store.find_similar_text(
                text, threshold=self._SIMILARITY_THRESHOLD
            )
            if similar:
                meta = similar["metadata"]
                print(
                    f"\n发现相似会议:{meta.get('title', '')} ({meta.get('date', '')}) "
                    f"相似度 {similar['score']:.2%}"
                )
                if not self._confirm_overwrite():
                    logger.info("Skipped similar meeting: %s", meta.get("title", ""))
                    return None
                # NOTE(review): choosing "overwrite" here only switches to
                # force mode; the similar entry itself is removed later only
                # if _handle_duplicate finds a meeting with the same
                # title/date as the newly extracted data.
                force = True

        meeting_data = self._extract(text)
        if not meeting_data:
            logger.error("Failed to extract meeting information")
            return None

        data_dict = meeting_data.model_dump()
        data_dict["_content_hash"] = content_hash
        data_dict["_graph_meeting_id"] = meeting_vector_store._meeting_id(data_dict)

        if self._handle_duplicate(data_dict, force):
            return None

        meeting_title = data_dict.get("title", "")
        meeting_date = data_dict.get("date", "")
        raw_path = raw_meeting_store.save(text, title=meeting_title, date=meeting_date)
        data_dict["_original_text"] = text
        data_dict["_original_text_path"] = raw_path

        meeting_filename = f"{meeting_vector_store._meeting_id(data_dict)}.md"
        data_dict["action_items"] = state_store.merge_action_items(
            data_dict.get("action_items", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["metrics"] = state_store.merge_metrics(
            data_dict.get("metrics", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )

        # NOTE(review): the content hash is recorded and saved before the
        # vector and graph writes below. If either of those raises, a later
        # non-forced run will hit the content-hash check above and skip this
        # text. Confirm whether this ordering is intended.
        state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename)
        state_store.save()
        meeting_vector_store.add_meeting(data_dict)
        graph_store.upsert_meeting_subgraph(data_dict)

        logger.info("Meeting processed: %s", meeting_title)
        return raw_path

    def _confirm_overwrite(self, reader=None) -> bool:
        """Ask whether to skip or overwrite and return the decision.

        Keeps asking until the answer is ``s`` or ``o`` (case-insensitive,
        surrounding whitespace ignored). An empty answer means skip. If the
        input stream is exhausted (``EOFError``), the default (skip) is used
        instead of crashing.

        Args:
            reader: Optional callable taking the prompt and returning the
                answer; defaults to the built-in ``input``, looked up at call
                time.

        Returns:
            ``True`` to overwrite, ``False`` to skip.
        """
        read = input if reader is None else reader
        while True:
            try:
                choice = read(self._OVERWRITE_PROMPT).strip().lower() or "s"
            except EOFError:
                return False
            if choice == "s":
                return False
            if choice == "o":
                return True
            print("请输入 s 或 o。")

    def _handle_duplicate(self, data_dict: dict, force: bool) -> bool:
        """Resolve a clash with a stored meeting of the same title and date.

        Args:
            data_dict: Extracted meeting data; ``title`` and ``date`` are
                used for the lookup.
            force: When true, overwrite an existing meeting without asking.

        Returns:
            ``True`` when the caller should skip this meeting, ``False`` when
            processing should continue (no clash, or the old meeting was
            removed).
        """
        title = data_dict.get("title", "")
        date = data_dict.get("date", "")
        existing = meeting_vector_store.find_meeting(title, date)
        if not existing:
            return False

        if force:
            logger.info("Duplicate meeting found; overwriting in force mode: %s", title)
            self._remove_old(data_dict, existing)
            return False

        print(f"\n发现重复会议:{title} ({date})")
        if not self._confirm_overwrite():
            logger.info("Skipped duplicate meeting: %s", title)
            return True
        self._remove_old(data_dict, existing)
        return False

    def _remove_old(self, data_dict: dict, existing: Optional[dict] = None):
        """Remove stored artifacts of the meeting identified by ``data_dict``.

        Removes the meeting from the vector store and the graph store, then
        drops the new content hash and, if different, the hash recorded on
        the existing entry from the state store.

        Args:
            data_dict: Extracted meeting data; its ``_content_hash`` is read.
            existing: The previously stored entry, if any; its
                ``content_hash`` is read.
        """
        meeting_id = meeting_vector_store._meeting_id(data_dict)
        meeting_vector_store.remove_meeting(meeting_id)
        graph_store.remove_meeting_subgraph(meeting_id)

        new_hash = data_dict.get("_content_hash", "")
        if new_hash:
            state_store.remove_content_hash(new_hash)
        if existing:
            old_hash = existing.get("content_hash", "")
            if old_hash and old_hash != new_hash:
                state_store.remove_content_hash(old_hash)

        logger.info("Removed old meeting artifacts: %s", data_dict.get("title", ""))

    def _compute_content_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of the normalized text.

        Normalization strips leading/trailing whitespace and converts CRLF
        line endings to LF, so those differences do not change the hash.
        """
        normalized = text.strip().replace("\r\n", "\n")
        return hashlib.sha256(normalized.encode("utf-8")).hexdigest()

    def _extract(self, text: str) -> Optional["MeetingExtraction"]:
        """Run ``extract_meeting_info`` on the text.

        Returns:
            The extraction result, or ``None`` if extraction raised; the
            failure is logged together with its traceback.
        """
        try:
            return extract_meeting_info(text)
        except Exception as exc:
            # Deliberate best-effort boundary: the caller treats None as
            # "extraction failed" and aborts processing.
            logger.exception("LLM extraction failed: %s", exc)
            return None

    def query(self, question: str, top_k: int = 3) -> str:
        """Build a text context for a question from both stores.

        Args:
            question: The user's question.
            top_k: Passed as ``top_k`` to the vector store and as ``limit``
                to the graph store.

        Returns:
            The vector context and the numbered graph facts, each under its
            own heading and separated by a blank line; an empty string when
            neither store returns anything.
        """
        vector_context = meeting_vector_store.query_as_context(question, top_k=top_k)
        graph_results = graph_store.search_facts(question, limit=top_k)

        parts = []
        if vector_context:
            parts.append("=== Vector Context ===\n" + vector_context)
        if graph_results:
            graph_lines = []
            for idx, row in enumerate(graph_results, start=1):
                # Fall back to the row's kind, then to "graph", for untitled rows.
                title = row.get("title", row.get("kind", "graph"))
                text = row.get("text", "")
                date = row.get("date", "")
                suffix = f" ({date})" if date else ""
                graph_lines.append(f"[{idx}] {title}{suffix}\n{text}")
            parts.append("=== Graph Facts ===\n" + "\n\n".join(graph_lines))
        return "\n\n".join(parts)

    def stats(self) -> dict:
        """Return statistics from each store plus the configured storage paths."""
        return {
            "vector_index": meeting_vector_store.get_stats(),
            "graph": graph_store.get_stats(),
            "state": state_store.get_stats(),
            "raw_dir": config.storage.raw_dir,
            "state_path": config.state_path,
        }
# Module-level MeetingProcessor instance created at import time
# (presumably the shared instance other modules import; verify against callers).
meeting_processor = MeetingProcessor()