# File-viewer header captured with this listing: 175 lines, 6.4 KiB, Python.
import hashlib
|
||
import logging
|
||
import os
|
||
from typing import Optional
|
||
|
||
from config import config
|
||
from extractor import MeetingExtraction, extract_meeting_info
|
||
from graph_store import graph_store
|
||
from meeting_state import MeetingStateStore
|
||
from raw_store import raw_meeting_store
|
||
from vector_store import meeting_vector_store
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Shared state store, constructed from the path in ``config.state_path``.
# MeetingProcessor uses it for content hashes and for merging action items
# and metrics.
state_store = MeetingStateStore(config.state_path)
|
||
|
||
|
||
class MeetingProcessor:
    """Ingest meeting notes into the stores and answer questions over them.

    Ingestion (:meth:`process_meeting_text`) hashes the text, checks for exact
    and near duplicates, extracts structured data, saves the raw text, merges
    action items and metrics through ``state_store``, and writes the result to
    the vector and graph stores. Duplicate handling is interactive: it prints
    to stdout and reads the user's choice from stdin.
    """

    # Value passed as ``threshold`` to ``find_similar_text`` during the
    # near-duplicate check.
    SIMILARITY_THRESHOLD = 0.92

    def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]:
        """Read a UTF-8 text file and process its contents as a meeting.

        Args:
            filepath: Path of the file to read.
            force: Forwarded unchanged to :meth:`process_meeting_text`.

        Returns:
            Whatever :meth:`process_meeting_text` returns for the file's text.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        return self.process_meeting_text(text, force=force)

    def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]:
        """Process raw meeting text and write it to the raw, vector and graph stores.

        Args:
            text: The meeting notes.
            force: When true, skip the content-hash and similarity checks and
                overwrite an existing meeting with the same title and date
                without prompting.

        Returns:
            The value returned by ``raw_meeting_store.save`` on success, or
            ``None`` when the text was skipped as a duplicate or extraction
            failed.
        """
        content_hash = self._compute_content_hash(text)

        if not force and state_store.has_content_hash(content_hash):
            print("\n检测到重复内容,已跳过。")
            logger.info("Duplicate content hash skipped: %s", content_hash[:12])
            return None

        if not force:
            similar = meeting_vector_store.find_similar_text(
                text, threshold=self.SIMILARITY_THRESHOLD
            )
            if similar:
                meta = similar["metadata"]
                print(
                    f"\n发现相似会议:{meta.get('title', '')} ({meta.get('date', '')}) "
                    f"相似度 {similar['score']:.2%}"
                )
                if self._prompt_skip_or_overwrite() == "s":
                    logger.info("Skipped similar meeting: %s", meta.get("title", ""))
                    return None
                # Overwrite chosen: act as forced from here on so the
                # title/date duplicate check below does not prompt again.
                # NOTE(review): the similar meeting is only removed if the
                # extracted title/date also match it in _handle_duplicate —
                # confirm that is the intended behaviour.
                force = True

        meeting_data = self._extract(text)
        if not meeting_data:
            logger.error("Failed to extract meeting information")
            return None

        data_dict = meeting_data.model_dump()
        data_dict["_content_hash"] = content_hash
        data_dict["_graph_meeting_id"] = meeting_vector_store._meeting_id(data_dict)

        if self._handle_duplicate(data_dict, force):
            return None

        meeting_title = data_dict.get("title", "")
        meeting_date = data_dict.get("date", "")
        raw_path = raw_meeting_store.save(text, title=meeting_title, date=meeting_date)

        data_dict["_original_text"] = text
        data_dict["_original_text_path"] = raw_path

        meeting_filename = f"{meeting_vector_store._meeting_id(data_dict)}.md"

        data_dict["action_items"] = state_store.merge_action_items(
            data_dict.get("action_items", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["metrics"] = state_store.merge_metrics(
            data_dict.get("metrics", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )

        meeting_vector_store.add_meeting(data_dict)
        graph_store.upsert_meeting_subgraph(data_dict)

        # Record and persist the content hash only after both index writes
        # have returned. If either write raises, the text is not left marked
        # as already processed, so a retry is not skipped as a duplicate.
        state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename)
        state_store.save()

        logger.info("Meeting processed: %s", meeting_title)
        return raw_path

    def _prompt_skip_or_overwrite(self) -> str:
        """Ask on stdin whether to skip or overwrite; return ``"s"`` or ``"o"``.

        Empty input selects ``"s"``. Any other answer re-prompts. If stdin is
        closed (``input`` raises ``EOFError``), ``"s"`` is returned, matching
        the default shown in the prompt.
        """
        while True:
            try:
                choice = input("选择 [s]跳过 / [o]覆盖(默认 s):").strip().lower() or "s"
            except EOFError:
                logger.warning("No interactive input available; defaulting to skip")
                return "s"
            if choice in ("s", "o"):
                return choice
            print("请输入 s 或 o。")

    def _handle_duplicate(self, data_dict: dict, force: bool) -> bool:
        """Resolve a clash with an already stored meeting of the same title and date.

        Args:
            data_dict: Extracted meeting data; ``title`` and ``date`` are read.
            force: When true, an existing meeting is overwritten without asking.

        Returns:
            ``True`` if the caller should skip this meeting, ``False`` if it
            should continue (no clash, or the old artifacts were removed).
        """
        title = data_dict.get("title", "")
        date = data_dict.get("date", "")
        existing = meeting_vector_store.find_meeting(title, date)

        if not existing:
            return False

        if force:
            logger.info("Duplicate meeting found; overwriting in force mode: %s", title)
            self._remove_old(data_dict, existing)
            return False

        print(f"\n发现重复会议:{title} ({date})")
        if self._prompt_skip_or_overwrite() == "s":
            logger.info("Skipped duplicate meeting: %s", title)
            return True

        self._remove_old(data_dict, existing)
        return False

    def _remove_old(self, data_dict: dict, existing: Optional[dict] = None):
        """Remove stored artifacts that the meeting in ``data_dict`` replaces.

        The meeting id is derived from ``data_dict`` (the new extraction) and
        removed from the vector and graph stores. The new content hash, and
        the existing record's ``content_hash`` when it differs, are removed
        from ``state_store``. This method does not call ``state_store.save``.

        Args:
            data_dict: Newly extracted meeting data.
            existing: The previously stored record, if one was found.
        """
        meeting_id = meeting_vector_store._meeting_id(data_dict)
        meeting_vector_store.remove_meeting(meeting_id)
        graph_store.remove_meeting_subgraph(meeting_id)

        new_hash = data_dict.get("_content_hash", "")
        if new_hash:
            state_store.remove_content_hash(new_hash)

        if existing:
            old_hash = existing.get("content_hash", "")
            if old_hash and old_hash != new_hash:
                state_store.remove_content_hash(old_hash)

        logger.info("Removed old meeting artifacts: %s", data_dict.get("title", ""))

    def _compute_content_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of ``text`` after light normalisation.

        Leading and trailing whitespace is stripped and ``\\r\\n`` is replaced
        with ``\\n`` before hashing, so the same notes saved with Windows or
        Unix line endings produce the same hash.
        """
        normalized = text.strip().replace("\r\n", "\n")
        return hashlib.sha256(normalized.encode("utf-8")).hexdigest()

    def _extract(self, text: str) -> Optional["MeetingExtraction"]:
        """Run ``extract_meeting_info`` on ``text``; return ``None`` on any exception.

        The failure is logged with its traceback instead of being propagated.
        """
        try:
            return extract_meeting_info(text)
        except Exception as exc:
            # logger.exception records the traceback along with the message.
            logger.exception("LLM extraction failed: %s", exc)
            return None

    def query(self, question: str, top_k: int = 3) -> str:
        """Build a text context for ``question`` from the vector and graph stores.

        Args:
            question: The user's question.
            top_k: Passed as ``top_k`` to the vector store and as ``limit`` to
                the graph store.

        Returns:
            The non-empty sections ("=== Vector Context ===" and/or
            "=== Graph Facts ===") joined by blank lines, or an empty string
            when neither store returned anything.
        """
        vector_context = meeting_vector_store.query_as_context(question, top_k=top_k)
        graph_results = graph_store.search_facts(question, limit=top_k)

        parts = []
        if vector_context:
            parts.append("=== Vector Context ===\n" + vector_context)

        if graph_results:
            graph_lines = []
            for idx, row in enumerate(graph_results, start=1):
                # Fall back to the row's kind, then to "graph", when no title is present.
                title = row.get("title", row.get("kind", "graph"))
                text = row.get("text", "")
                date = row.get("date", "")
                suffix = f" ({date})" if date else ""
                graph_lines.append(f"[{idx}] {title}{suffix}\n{text}")
            parts.append("=== Graph Facts ===\n" + "\n\n".join(graph_lines))

        return "\n\n".join(parts)

    def stats(self) -> dict:
        """Return statistics from each store plus the configured storage paths."""
        return {
            "vector_index": meeting_vector_store.get_stats(),
            "graph": graph_store.get_stats(),
            "state": state_store.get_stats(),
            "raw_dir": config.storage.raw_dir,
            "state_path": config.state_path,
        }
|
||
|
||
|
||
# Module-level MeetingProcessor instance, created when this module is imported.
meeting_processor = MeetingProcessor()
|