# meeting_memory/meeting_processor.py

import hashlib
import logging
import os
from typing import Optional

from config import config
from extractor import MeetingExtraction, extract_meeting_info
from graph_store import graph_store
from meeting_state import MeetingStateStore
from raw_store import raw_meeting_store
from vector_store import meeting_vector_store

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# State store built from the configured state path. MeetingProcessor uses it
# for content-hash bookkeeping and for merging action items and metrics.
state_store = MeetingStateStore(config.state_path)


class MeetingProcessor:
    """Ingest meeting transcripts and answer questions over what was stored.

    Ingestion de-duplicates the transcript (content hash, then vector
    similarity, then title/date lookup), extracts structured data with
    ``extract_meeting_info``, saves the raw text, merges action items and
    metrics through the module-level ``state_store``, and finally writes the
    meeting to the vector store and the graph store.
    """

    def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]:
        """Read a UTF-8 text file and process its content as a meeting.

        Args:
            filepath: Path of the transcript file to read.
            force: Forwarded unchanged to :meth:`process_meeting_text`.

        Returns:
            Whatever :meth:`process_meeting_text` returns for the file content.

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file content is not valid UTF-8.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        return self.process_meeting_text(text, force=force)

    def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]:
        """Run the full ingestion pipeline on a meeting transcript.

        Args:
            text: Raw transcript text.
            force: When true, skip the content-hash and similarity checks and
                overwrite an existing meeting with the same title/date without
                prompting.

        Returns:
            The value returned by ``raw_meeting_store.save`` for the stored
            transcript, or ``None`` when the text is empty, was skipped as a
            duplicate, or extraction failed.
        """
        # An empty transcript carries nothing to extract; do not spend an
        # extraction call or store an empty meeting.
        if not text or not text.strip():
            logger.warning("Empty meeting text; nothing to process")
            return None

        content_hash = self._compute_content_hash(text)

        if not force:
            if state_store.has_content_hash(content_hash):
                print("\n检测到重复内容，已跳过。")
                logger.info("Duplicate content hash skipped: %s", content_hash[:12])
                return None

            similar = meeting_vector_store.find_similar_text(text, threshold=0.92)
            if similar:
                meta = similar["metadata"]
                print(
                    f"\n发现相似会议：{meta.get('title', '')} ({meta.get('date', '')}) "
                    f"相似度 {similar['score']:.2%}"
                )
                if not self._prompt_skip_or_overwrite():
                    logger.info("Skipped similar meeting: %s", meta.get("title", ""))
                    return None
                # NOTE(review): "overwrite" only switches to force mode here.
                # The similar meeting itself is removed later only if
                # find_meeting() matches the newly extracted title/date.
                force = True

        meeting_data = self._extract(text)
        if not meeting_data:
            logger.error("Failed to extract meeting information")
            return None

        data_dict = meeting_data.model_dump()
        data_dict["_content_hash"] = content_hash
        data_dict["_graph_meeting_id"] = meeting_vector_store._meeting_id(data_dict)

        if self._handle_duplicate(data_dict, force):
            return None

        meeting_title = data_dict.get("title", "")
        meeting_date = data_dict.get("date", "")
        raw_path = raw_meeting_store.save(text, title=meeting_title, date=meeting_date)

        data_dict["_original_text"] = text
        data_dict["_original_text_path"] = raw_path

        meeting_filename = f"{meeting_vector_store._meeting_id(data_dict)}.md"

        data_dict["action_items"] = state_store.merge_action_items(
            data_dict.get("action_items", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )
        data_dict["metrics"] = state_store.merge_metrics(
            data_dict.get("metrics", []),
            meeting_title,
            meeting_date,
            meeting_filename,
        )

        # Write the indexes first and record the content hash only afterwards.
        # If an index write raises, the hash stays unrecorded, so a retry is
        # not silently skipped as a "duplicate" of something never stored.
        meeting_vector_store.add_meeting(data_dict)
        graph_store.upsert_meeting_subgraph(data_dict)
        state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename)
        state_store.save()

        logger.info("Meeting processed: %s", meeting_title)
        return raw_path

    def _prompt_skip_or_overwrite(self) -> bool:
        """Ask on stdin whether to skip or overwrite.

        Re-prompts until the answer is ``s`` or ``o`` (case-insensitive,
        surrounding whitespace ignored). An empty answer means skip.

        Returns:
            ``True`` to overwrite, ``False`` to skip.
        """
        while True:
            try:
                answer = input("选择 [s]跳过 / [o]覆盖（默认 s）：")
            except EOFError:
                # stdin is closed or non-interactive (batch run, pipe): fall
                # back to the prompt's default instead of crashing ingestion.
                return False
            choice = answer.strip().lower() or "s"
            if choice == "s":
                return False
            if choice == "o":
                return True
            print("请输入 s 或 o。")

    def _handle_duplicate(self, data_dict: dict, force: bool) -> bool:
        """Resolve a clash with an already stored meeting of the same title/date.

        Args:
            data_dict: Extracted meeting data; ``title`` and ``date`` are used
                for the lookup.
            force: When true, overwrite an existing meeting without prompting.

        Returns:
            ``True`` if the caller should skip this meeting, ``False`` if
            processing should continue (no existing meeting, or the old one
            was removed).
        """
        title = data_dict.get("title", "")
        date = data_dict.get("date", "")
        existing = meeting_vector_store.find_meeting(title, date)

        if not existing:
            return False

        if force:
            logger.info("Duplicate meeting found; overwriting in force mode: %s", title)
        else:
            print(f"\n发现重复会议：{title} ({date})")
            if not self._prompt_skip_or_overwrite():
                logger.info("Skipped duplicate meeting: %s", title)
                return True

        self._remove_old(data_dict, existing)
        return False

    def _remove_old(self, data_dict: dict, existing: Optional[dict] = None) -> None:
        """Remove the stored artifacts of the meeting that is being replaced.

        Deletes the meeting from the vector store and the graph store using
        the id derived from ``data_dict``, then drops the related content
        hashes from the state store.

        Args:
            data_dict: Newly extracted meeting data (may carry ``_content_hash``).
            existing: The previously stored record, if available; its
                ``content_hash`` entry is removed as well.
        """
        meeting_id = meeting_vector_store._meeting_id(data_dict)
        meeting_vector_store.remove_meeting(meeting_id)
        graph_store.remove_meeting_subgraph(meeting_id)

        new_hash = data_dict.get("_content_hash", "")
        if new_hash:
            state_store.remove_content_hash(new_hash)

        if existing:
            old_hash = existing.get("content_hash", "")
            # Skip the second removal when both hashes are the same value.
            if old_hash and old_hash != new_hash:
                state_store.remove_content_hash(old_hash)

        logger.info("Removed old meeting artifacts: %s", data_dict.get("title", ""))

    def _compute_content_hash(self, text: str) -> str:
        """Return the SHA-256 hex digest of the normalized transcript.

        Normalization strips leading/trailing whitespace and converts CRLF
        line endings to LF, so those differences do not change the hash.
        """
        normalized = text.strip().replace("\r\n", "\n")
        return hashlib.sha256(normalized.encode("utf-8")).hexdigest()

    def _extract(self, text: str) -> "Optional[MeetingExtraction]":
        """Run ``extract_meeting_info`` on the text, returning ``None`` on failure.

        Any exception from the extractor is logged with its traceback and
        swallowed so the caller can treat a failed extraction as "no result".
        """
        try:
            return extract_meeting_info(text)
        except Exception as exc:
            # logger.exception records the traceback in addition to the message.
            logger.exception("LLM extraction failed: %s", exc)
            return None

    def query(self, question: str, top_k: int = 3) -> str:
        """Build a text context for a question from the vector and graph stores.

        Args:
            question: The user question.
            top_k: Passed as ``top_k`` to the vector store and as ``limit`` to
                the graph store.

        Returns:
            The non-empty sections ("Vector Context", "Graph Facts") joined by
            blank lines, or an empty string when neither store returned
            anything.
        """
        vector_context = meeting_vector_store.query_as_context(question, top_k=top_k)
        graph_results = graph_store.search_facts(question, limit=top_k)

        parts = []
        if vector_context:
            parts.append("=== Vector Context ===\n" + vector_context)

        if graph_results:
            graph_lines = []
            for idx, row in enumerate(graph_results, start=1):
                # Fall back to the row's kind, then to a generic label, when
                # the row carries no title.
                title = row.get("title", row.get("kind", "graph"))
                text = row.get("text", "")
                date = row.get("date", "")
                suffix = f" ({date})" if date else ""
                graph_lines.append(f"[{idx}] {title}{suffix}\n{text}")
            parts.append("=== Graph Facts ===\n" + "\n\n".join(graph_lines))

        return "\n\n".join(parts)

    def stats(self) -> dict:
        """Return statistics from each store plus the configured storage paths."""
        return {
            "vector_index": meeting_vector_store.get_stats(),
            "graph": graph_store.get_stats(),
            "state": state_store.get_stats(),
            "raw_dir": config.storage.raw_dir,
            "state_path": config.state_path,
        }


# Module-level MeetingProcessor instance, created when this module is imported.
meeting_processor = MeetingProcessor()