commit d5c9368c1b25b07989240c080fefda7d6df3b6c7 Author: Bifang <915779419@qq.com> Date: Tue Jun 9 10:38:24 2026 +0800 测试版本第一版 diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..afbf44c --- /dev/null +++ b/.env.example @@ -0,0 +1,16 @@ +# LLM API +LLM_API_KEY=sk-your-api-key +LLM_BASE_URL=https://api.deepseek.com/v1 +LLM_MODEL=deepseek-chat + +# Embedding API +EMBEDDING_API_KEY=sk-your-embedding-key +EMBEDDING_BASE_URL=https://api.openai.com/v1 +EMBEDDING_MODEL=text-embedding-3-small + +# Neo4j +NEO4J_ENABLED=false +NEO4J_URI=bolt://localhost:7687 +NEO4J_USER=neo4j +NEO4J_PASSWORD= +NEO4J_DATABASE=neo4j diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..00edad4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__/ +vector_store_data/ +.env +.vscode diff --git a/MIGRATION_TASKS.md b/MIGRATION_TASKS.md new file mode 100644 index 0000000..95d0ef7 --- /dev/null +++ b/MIGRATION_TASKS.md @@ -0,0 +1,18 @@ +# Migration Tasks + +## In Progress + +- [ ] Rework deduplication and identity resolution for entities, action items, and metrics + +## Todo + +## Done + +- [x] Remove Obsidian from the project documentation and dependency surface +- [x] Remove Obsidian from the runtime processing pipeline +- [x] Move raw meeting archival to `data/raw` +- [x] Move meeting state storage to `data/meeting_state.json` +- [x] Introduce Neo4j configuration and a minimal graph storage layer +- [x] Write extracted meeting entities and relations into Neo4j +- [x] Add graph statistics to the CLI status output +- [x] Redesign retrieval to combine vector recall with graph facts diff --git a/README.md b/README.md new file mode 100644 index 0000000..9c47849 --- /dev/null +++ b/README.md @@ -0,0 +1,114 @@ +# 会议纪要长期记忆系统 + +基于 `LLM + LlamaIndex + 本地状态存储` 的会议纪要长期记忆原型,当前聚焦三件事: + +- 会议内容结构化抽取 +- 跨会议行动项与指标状态追踪 +- 语义检索与重复内容过滤 + +## 当前处理链路 + +```text +meeting.md + -> 内容哈希去重 + -> 语义相似去重 + -> LLM 结构化抽取 + -> 状态合并 + -> 原文归档到 data/raw + -> 写入向量索引 +``` + +当前版本已经移除了 `Obsidian` 作为主流程依赖,后续会继续引入图数据库来承载关系层。 + +## 快速开始 + +```bash +cd meeting_memory + +python -m venv .venv +.venv\Scripts\pip install -r requirements.txt + +copy .env.example .env +# 然后补充 API 配置 + +.venv\Scripts\python main.py process meeting_example.md +``` + +## 用法 + +```bash +python main.py +python main.py process meeting_example.md +python main.py process meeting_example.md -f +python main.py query "弱光指标目标值是多少" +python main.py stats +python main.py text "今天会议讨论了..." +python main.py batch "meetings/*.md" -f +``` + +## 目录 + +```text +meeting_memory/ +├── config.py 配置 +├── extractor.py LLM 结构化抽取 +├── meeting_processor.py 主处理流程 +├── meeting_state.py 跨会议状态追踪 +├── raw_store.py 原文归档 +├── vector_store.py 向量索引与语义检索 +├── main.py CLI 入口 +├── data/ +│ ├── raw/ 原始会议文本归档 +│ └── meeting_state.json 状态持久化 +└── vector_store_data/ 向量索引持久化目录 +``` + +## 核心能力 + +### 1. 结构化抽取 + +从会议文本中提取: + +- 标题、日期、参会人 +- 实体 +- 关系 +- 行动项 +- 指标 +- 决策 +- 摘要 + +### 2. 长期状态追踪 + +- 行动项按 `task + assignee` 做稳定 ID +- 指标按 `metric_name + owner` 做稳定 ID +- 保留历史状态演化 +- 支持跨会议合并 + +### 3. 双重去重 + +- 内容哈希精确去重 +- 语义相似度去重 + +### 4. 语义检索 + +- 会议内容写入向量库 +- 支持自然语言查询 +- 索引持久化,重启自动加载 + +## 配置 + +编辑 `.env`: + +```ini +LLM_API_KEY=sk-xxx +LLM_BASE_URL=https://api.deepseek.com/v1 +LLM_MODEL=deepseek-chat + +EMBEDDING_API_KEY=sk-xxx +EMBEDDING_BASE_URL=https://api.openai.com/v1 +EMBEDDING_MODEL=text-embedding-3-small +``` + +## 迁移计划 + +当前迁移进度见 [MIGRATION_TASKS.md](/d:/github_project/my_code/meeting_memory/MIGRATION_TASKS.md:1)。 diff --git a/config.py b/config.py new file mode 100644 index 0000000..d430240 --- /dev/null +++ b/config.py @@ -0,0 +1,51 @@ +import os + +from dotenv import load_dotenv +from pydantic import BaseModel, Field + +load_dotenv() + +PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__)) + + +class LLMConfig(BaseModel): + api_key: str = Field(default=os.getenv("LLM_API_KEY", "")) + base_url: str = Field(default=os.getenv("LLM_BASE_URL", "https://api.deepseek.com/v1")) + model: str = Field(default=os.getenv("LLM_MODEL", "deepseek-chat")) + max_tokens: int = Field(default=64000) + temperature: float = Field(default=0.95) + + +class EmbeddingConfig(BaseModel): + api_key: str = Field(default=os.getenv("EMBEDDING_API_KEY", "")) + api_base: str = Field(default=os.getenv("EMBEDDING_BASE_URL", "https://api.openai.com/v1")) + model: str = Field(default=os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")) + + +class StorageConfig(BaseModel): + data_dir: str = Field(default=os.path.join(PROJECT_ROOT, "data")) + raw_dir: str = Field(default=os.path.join(PROJECT_ROOT, "data", "raw")) + + +class VectorStoreConfig(BaseModel): + persist_dir: str = Field(default=os.path.join(PROJECT_ROOT, "vector_store_data")) + + +class Neo4jConfig(BaseModel): + enabled: bool = Field(default=os.getenv("NEO4J_ENABLED", "false").lower() == "true") + uri: str = Field(default=os.getenv("NEO4J_URI", "bolt://localhost:7687")) + user: str = Field(default=os.getenv("NEO4J_USER", "neo4j")) + password: str = Field(default=os.getenv("NEO4J_PASSWORD", "")) + database: str = Field(default=os.getenv("NEO4J_DATABASE", "neo4j")) + + +class ProjectConfig(BaseModel): + llm: LLMConfig = Field(default_factory=LLMConfig) + embedding: EmbeddingConfig = Field(default_factory=EmbeddingConfig) + storage: StorageConfig = Field(default_factory=StorageConfig) + vector_store: VectorStoreConfig = Field(default_factory=VectorStoreConfig) + neo4j: Neo4jConfig = Field(default_factory=Neo4jConfig) + state_path: str = Field(default=os.path.join(PROJECT_ROOT, "data", "meeting_state.json")) + + +config = ProjectConfig() diff --git a/extractor.py b/extractor.py new file mode 100644 index 0000000..d137af8 --- /dev/null +++ b/extractor.py @@ -0,0 +1,157 @@ +import json +import logging +import re +from typing import List, Optional +from pydantic import BaseModel + +from openai import OpenAI + +from config import config + +logger = logging.getLogger(__name__) + +client = OpenAI( + api_key=config.llm.api_key or None, + base_url=config.llm.base_url if config.llm.base_url else None, +) + + +class Entity(BaseModel): + name: str + entity_type: str + description: str = "" + + +class Relation(BaseModel): + subject: str + subject_type: str + predicate: str + object: str + object_type: str + description: str = "" + + +class ActionItem(BaseModel): + task: str + assignee: str = "" + deadline: str = "" + status: str = "待办" + priority: str = "中" + + +class Decision(BaseModel): + content: str + proposer: str = "" + status: str = "已决" + + +class MeetingMetric(BaseModel): + metric_name: str + value: str + target: str = "" + owner: str = "" + trend: str = "" + + +class MeetingExtraction(BaseModel): + title: str + date: str = "" + participants: List[str] = [] + agenda: List[str] = [] + entities: List[Entity] = [] + relations: List[Relation] = [] + action_items: List[ActionItem] = [] + decisions: List[Decision] = [] + metrics: List[MeetingMetric] = [] + summary: str = "" + + +EXTRACTION_SYSTEM_PROMPT = """ +你是一个专业的会议纪要信息抽取专家。你的任务是从中文会议记录中抽取结构化信息,并严格按照要求的JSON格式返回。 + +## 抽取内容 + +### 1. 实体 +- 人物:参会人员、提及的人员 +- 组织/部门:公司、部门、团队 +- 项目/任务:正在进行的项目、任务 +- 指标/KPI:关键绩效指标(如转化率、退单率等) +- 概念/制度:管理概念、制度要求 +- 地点:会议地点、项目地点 + +### 2. 关系 (主体-关系谓词-客体) +抽取事实性关系,例如: +- {"subject": "建维部", "subject_type": "组织", "predicate": "负责", "object": "网络运维", "object_type": "任务", "description": ""} +- {"subject": "弱光指标", "subject_type": "指标", "predicate": "目标值", "object": "0.5以下", "object_type": "数值", "description": ""} + +### 3. 行动项 +谁负责什么任务,截止时间,优先级 + +### 4. 决策 +做出的决定和结论 + +### 5. 指标数据 +具体的数字指标:当前值、目标值、负责人、趋势(向好/持平/恶化) + +## 规则 +- 只提取事实性信息 +- 过滤比喻、假设、主观评价 +- 数字指标要精确提取 +- entities、relations、action_items、decisions、metrics 如果没有则返回空数组 +""" + + +def _call_llm(system: str, user: str) -> str: + response = client.chat.completions.create( + model=config.llm.model, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + max_tokens=config.llm.max_tokens, + temperature=config.llm.temperature, + ) + content = response.choices[0].message.content + if content is None: + raise ValueError("LLM returned empty response") + return content + + +def extract_meeting_info(text: str) -> MeetingExtraction: + user_prompt = f""" +从以下会议记录中抽取结构化信息。 + +JSON字段说明: +- title: 会议标题 +- date: 会议日期 +- participants: 参会人列表 +- agenda: 议程列表 +- entities: 实体列表,每个实体包含 name(名称), entity_type(类型), description(描述) +- relations: 关系列表,每个关系包含 subject(主体), subject_type(主体类型), predicate(关系谓词), object(客体), object_type(客体类型), description(描述) +- action_items: 行动项列表,每条包含 task(任务), assignee(负责人), deadline(截止时间), status(状态), priority(优先级) +- decisions: 决策列表,每条包含 content(决策内容), proposer(提出人), status(状态) +- metrics: 指标列表,每条包含 metric_name(指标名), value(当前值), target(目标值), owner(负责人), trend(趋势) +- summary: 会议摘要 + +请直接返回JSON对象。不要包含任何额外说明文字。 + +会议记录: +{text} +""" + content = _call_llm(EXTRACTION_SYSTEM_PROMPT, user_prompt) + data = _try_parse_json(content) + return MeetingExtraction(**data) + + +def _try_parse_json(content: str) -> dict: + try: + return json.loads(content) + except json.JSONDecodeError: + logger.warning("JSON解析失败,尝试修复...") + match = re.search(r'\{.*\}', content, re.DOTALL) + if match: + try: + return json.loads(match.group()) + except json.JSONDecodeError as e: + logger.error(f"修复后的JSON仍无法解析: {e}") + raise \ No newline at end of file diff --git a/graph_store.py b/graph_store.py new file mode 100644 index 0000000..5712dbe --- /dev/null +++ b/graph_store.py @@ -0,0 +1,250 @@ +import logging +from typing import Any, Dict, List + +from config import config + +logger = logging.getLogger(__name__) + + +class Neo4jGraphStore: + def __init__(self): + self._driver = None + self._enabled = False + self._connect() + + def _connect(self): + if not config.neo4j.enabled: + logger.info("Neo4j graph store disabled") + return + + try: + from neo4j import GraphDatabase + except ImportError: + logger.warning("neo4j package is not installed") + return + + if not config.neo4j.password: + logger.warning("Neo4j is enabled but NEO4J_PASSWORD is empty") + return + + self._driver = GraphDatabase.driver( + config.neo4j.uri, + auth=(config.neo4j.user, config.neo4j.password), + ) + self._enabled = True + + @property + def enabled(self) -> bool: + return self._enabled and self._driver is not None + + def close(self): + if self._driver is not None: + self._driver.close() + + def run_query(self, query: str, **params) -> List[Dict[str, Any]]: + if not self.enabled: + return [] + + with self._driver.session(database=config.neo4j.database) as session: + result = session.run(query, **params) + return [record.data() for record in result] + + def initialize_schema(self): + if not self.enabled: + return + + statements = [ + "CREATE CONSTRAINT meeting_id IF NOT EXISTS FOR (m:Meeting) REQUIRE m.meeting_id IS UNIQUE", + "CREATE CONSTRAINT entity_name IF NOT EXISTS FOR (e:Entity) REQUIRE e.name IS UNIQUE", + "CREATE INDEX meeting_title IF NOT EXISTS FOR (m:Meeting) ON (m.title)", + "CREATE INDEX entity_type IF NOT EXISTS FOR (e:Entity) ON (e.entity_type)", + ] + for statement in statements: + self.run_query(statement) + + def get_stats(self) -> Dict[str, Any]: + if not self.enabled: + return {"enabled": False} + + rows = self.run_query( + """ + CALL { + MATCH (m:Meeting) + RETURN count(m) AS meetings + } + CALL { + MATCH (e:Entity) + RETURN count(e) AS entities + } + RETURN meetings, entities + """ + ) + if not rows: + return {"enabled": True, "meetings": 0, "entities": 0} + return {"enabled": True, **rows[0]} + + def upsert_meeting_subgraph(self, meeting_data: dict) -> None: + if not self.enabled: + return + + meeting_id = meeting_data.get("_graph_meeting_id") + if not meeting_id: + return + + self.initialize_schema() + self.run_query( + """ + MERGE (m:Meeting {meeting_id: $meeting_id}) + SET m.title = $title, + m.date = $date, + m.summary = $summary, + m.content_hash = $content_hash, + m.updated_at = datetime() + """, + meeting_id=meeting_id, + title=meeting_data.get("title", ""), + date=meeting_data.get("date", ""), + summary=meeting_data.get("summary", ""), + content_hash=meeting_data.get("_content_hash", ""), + ) + + for entity in meeting_data.get("entities", []): + self._upsert_entity(meeting_id, entity) + + for participant in meeting_data.get("participants", []): + self._upsert_entity( + meeting_id, + {"name": participant, "entity_type": "participant", "description": ""}, + ) + + for relation in meeting_data.get("relations", []): + self._upsert_relation(meeting_id, relation, meeting_data.get("date", "")) + + def _upsert_entity(self, meeting_id: str, entity: dict) -> None: + name = entity.get("name", "").strip() + if not name: + return + + self.run_query( + """ + MATCH (m:Meeting {meeting_id: $meeting_id}) + MERGE (e:Entity {name: $name}) + SET e.entity_type = CASE + WHEN $entity_type <> '' THEN $entity_type + ELSE coalesce(e.entity_type, '') + END, + e.description = CASE + WHEN $description <> '' THEN $description + ELSE coalesce(e.description, '') + END, + e.updated_at = datetime() + MERGE (m)-[:MENTIONS]->(e) + """, + meeting_id=meeting_id, + name=name, + entity_type=entity.get("entity_type", ""), + description=entity.get("description", ""), + ) + + def _upsert_relation(self, meeting_id: str, relation: dict, meeting_date: str) -> None: + subject = relation.get("subject", "").strip() + predicate = relation.get("predicate", "").strip() + obj = relation.get("object", "").strip() + if not subject or not predicate or not obj: + return + + self._upsert_entity( + meeting_id, + { + "name": subject, + "entity_type": relation.get("subject_type", ""), + "description": "", + }, + ) + self._upsert_entity( + meeting_id, + { + "name": obj, + "entity_type": relation.get("object_type", ""), + "description": "", + }, + ) + + self.run_query( + """ + MATCH (s:Entity {name: $subject}) + MATCH (o:Entity {name: $object}) + MERGE (s)-[r:RELATES_TO { + meeting_id: $meeting_id, + predicate: $predicate, + object_name: $object + }]->(o) + SET r.description = $description, + r.meeting_date = $meeting_date, + r.updated_at = datetime() + """, + meeting_id=meeting_id, + subject=subject, + predicate=predicate, + object=obj, + description=relation.get("description", ""), + meeting_date=meeting_date, + ) + + def remove_meeting_subgraph(self, meeting_id: str) -> None: + if not self.enabled: + return + + self.run_query( + """ + MATCH (m:Meeting {meeting_id: $meeting_id}) + OPTIONAL MATCH (m)-[mentions:MENTIONS]->(:Entity) + DELETE mentions + WITH m + OPTIONAL MATCH ()-[r:RELATES_TO {meeting_id: $meeting_id}]->() + DELETE r + WITH m + DELETE m + """, + meeting_id=meeting_id, + ) + + def search_facts(self, question: str, limit: int = 5) -> List[Dict[str, Any]]: + if not self.enabled or not question.strip(): + return [] + + rows = self.run_query( + """ + CALL { + MATCH (m:Meeting) + WHERE toLower(m.title) CONTAINS toLower($question) + OR toLower(coalesce(m.summary, '')) CONTAINS toLower($question) + RETURN 'meeting' AS kind, + m.title AS title, + coalesce(m.summary, '') AS text, + m.date AS date + UNION + MATCH (e:Entity) + WHERE toLower(e.name) CONTAINS toLower($question) + OR toLower(coalesce(e.description, '')) CONTAINS toLower($question) + OPTIONAL MATCH (e)-[r:RELATES_TO]-(other:Entity) + RETURN 'entity' AS kind, + e.name AS title, + coalesce( + head(collect( + e.name + ' -[' + coalesce(r.predicate, '') + ']-> ' + coalesce(other.name, '') + )), + coalesce(e.description, '') + ) AS text, + '' AS date + } + RETURN kind, title, text, date + LIMIT $limit + """, + question=question, + limit=limit, + ) + return rows + + +graph_store = Neo4jGraphStore() diff --git a/main.py b/main.py new file mode 100644 index 0000000..3bd184a --- /dev/null +++ b/main.py @@ -0,0 +1,185 @@ +import argparse +import glob as glob_module +import logging +import os +import sys + +if sys.stdout.encoding and sys.stdout.encoding.lower() == "gbk": + sys.stdout.reconfigure(encoding="utf-8") + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + datefmt="%H:%M:%S", +) +logger = logging.getLogger(__name__) + + +def cmd_process(args): + from meeting_processor import meeting_processor + + filepath = args.file + if not os.path.exists(filepath): + print(f"错误:文件不存在 {filepath}") + sys.exit(1) + + print(f"正在处理会议文件:{filepath}") + archive_path = meeting_processor.process_meeting_file(filepath, force=getattr(args, "force", False)) + + if archive_path: + print("\n处理完成") + print(f"原文归档:{archive_path}") + else: + print("\n处理失败或已跳过") + sys.exit(1) + + +def cmd_text(args): + from meeting_processor import meeting_processor + + print("正在处理会议文本...") + archive_path = meeting_processor.process_meeting_text(args.text, force=getattr(args, "force", False)) + if archive_path: + print("\n处理完成") + print(f"原文归档:{archive_path}") + else: + print("\n处理失败或已跳过") + + +def cmd_query(args): + from meeting_processor import meeting_processor + + print(f"查询:{args.question}") + print("-" * 40) + result = meeting_processor.query(args.question, top_k=args.top_k) + print(result if result else "未找到相关信息") + + +def cmd_stats(args): + from meeting_processor import meeting_processor + + stats = meeting_processor.stats() + print("会议记忆系统统计") + print("-" * 40) + print(f"向量节点数:{stats.get('vector_index', {}).get('node_count', 0)}") + print(f"Neo4j 启用:{stats.get('graph', {}).get('enabled', False)}") + print(f"图谱会议数:{stats.get('graph', {}).get('meetings', 0)}") + print(f"图谱实体数:{stats.get('graph', {}).get('entities', 0)}") + print(f"行动项数:{stats.get('state', {}).get('action_items_tracked', 0)}") + print(f"指标数:{stats.get('state', {}).get('metrics_tracked', 0)}") + print(f"会议系列数:{stats.get('state', {}).get('meeting_series', 0)}") + print(f"原文归档目录:{stats.get('raw_dir', '')}") + print(f"状态文件:{stats.get('state_path', '')}") + + +def cmd_batch(args): + from meeting_processor import meeting_processor + + files = glob_module.glob(args.pattern, recursive=True) + if not files: + print(f"未匹配到任何文件:{args.pattern}") + sys.exit(1) + + print(f"找到 {len(files)} 个文件,开始批量处理...") + success = 0 + for path in files: + try: + print(f"\n处理:{path}") + result = meeting_processor.process_meeting_file(path, force=getattr(args, "force", False)) + if result: + success += 1 + except Exception as exc: + logger.error("处理失败: %s - %s", path, exc) + + print(f"\n批量处理完成:{success}/{len(files)} 成功") + + +def cmd_interactive(): + from meeting_processor import meeting_processor + + print("会议纪要长期记忆系统") + print("=" * 50) + print("可用命令:") + print(" query <问题> 语义查询会议记忆") + print(" process <路径> 处理会议文件") + print(" stats 查看统计") + print(" help 显示帮助") + print(" exit/quit 退出") + print("=" * 50) + + while True: + try: + line = input("\n> ").strip() + except (EOFError, KeyboardInterrupt): + print() + break + + if not line: + continue + if line in ("exit", "quit", "q"): + break + if line == "help": + print(" query <问题>") + print(" process <路径>") + print(" stats") + print(" exit/quit") + continue + if line == "stats": + cmd_stats(None) + continue + if line.startswith("process "): + filepath = line[8:].strip() + if not os.path.exists(filepath): + print(f"文件不存在:{filepath}") + continue + result = meeting_processor.process_meeting_file(filepath) + print(f"完成:{result}" if result else "处理失败或已跳过") + continue + + question = line[6:].strip() if line.startswith("query ") else line + result = meeting_processor.query(question, top_k=3) + print(result if result else "未找到相关信息") + + print("bye!") + + +def main(): + parser = argparse.ArgumentParser(description="会议纪要长期记忆系统") + subparsers = parser.add_subparsers(dest="command") + + p_process = subparsers.add_parser("process", help="处理会议 markdown 文件") + p_process.add_argument("file", help="会议文件路径") + p_process.add_argument("-f", "--force", action="store_true", help="发现重复时自动覆盖") + + p_text = subparsers.add_parser("text", help="直接处理一段会议文本") + p_text.add_argument("text", help="会议文本内容") + p_text.add_argument("-f", "--force", action="store_true", help="发现重复时自动覆盖") + + p_query = subparsers.add_parser("query", help="语义查询会议记忆") + p_query.add_argument("question", help="查询问题") + p_query.add_argument("--top-k", type=int, default=3, help="返回结果数量") + + subparsers.add_parser("stats", help="查看统计") + + p_batch = subparsers.add_parser("batch", help="批量处理会议文件") + p_batch.add_argument("pattern", help="glob 模式,如 meetings/*.md") + p_batch.add_argument("-f", "--force", action="store_true", help="发现重复时自动覆盖") + + args = parser.parse_args() + + if args.command == "process": + cmd_process(args) + elif args.command == "text": + cmd_text(args) + elif args.command == "query": + cmd_query(args) + elif args.command == "stats": + cmd_stats(args) + elif args.command == "batch": + cmd_batch(args) + else: + cmd_interactive() + + +if __name__ == "__main__": + main() diff --git a/meeting1.md b/meeting1.md new file mode 100644 index 0000000..22f7ecf --- /dev/null +++ b/meeting1.md @@ -0,0 +1,27 @@ +会议概述 +会议主要围绕宽带运维指标、综合行政与工会管理、市场政企业务推进、满意度提升及年度考核准备等议题展开,旨在总结阶段性工作进度,协调跨部门资源,明确后续重点任务与考核要求。 + +主要讨论点 +宽带运维与网络质量:上周上门量及安装进度受天气影响,弱光指标趋近目标,FPTR达标但主动过境偏后。PCDN专线在学校端持续恶化,已报市公司分析IP并拟限速。超频基站故障已恢复,专线巡检进度符合预期。客服培训后内部机房问题已梳理清单。 +综合行政与工会事务:2025年剩余两项工作按原计划推进。工会经费压减,食堂改造拟于5月结合更替修补进行,拟引入自饮机降本。第四届体育文化节筹备中,因主力选手受伤需各部门抽调人员补充方阵。主题教育简报已发,基层党组织学习已完成。 +区表彰、主题教育与招待费管理:区级担当作为表彰正在对接,建议争取参与以拉开竞争差距。招待费实行每年公开1次制度,综合部超预算需调整26年预算,其他部门严控成本。政企对外接待需统筹,严禁客户经理个人垫资。 +拆迁商客、满意度考核与KPI准备:拆迁以二次升套为主,社区与单位集中营销并行。满意度测评发现开卷考试形式导致拉分,相关经理思想松懈。KPI考核已明确工信部有责及投诉率两项核心指标,强调日清日结与执行力。 +决策事项 +二级基站拆除及下电服务费调整需在4月15日前全量完成。 +招待费实行每年公开1次制度,综合部超预算需调整26年预算,其他部门严控成本。 +满意度测评取消开卷考试形式,后续采用“后机评”模式,市场部需按四公司规定制定满意客户样板及不满客户处理流程。 +第四届体育文化节方阵缺编9人,需各部门协调抽调,确定名单后统一采购服装并安排下班后排练。 +年度考核指标已初步定调,各部门需提前与市公司沟通争取有利政策,避免起跑线落后。 +待办事项 +宽带/客服部:跟进学校IP限速机制建立;处理客服内部机房问题清单并与客户沟通。 +综合部:完成食堂改造审计及自饮机引入批复跟进;落实招待费公示及纪检报备。 +市场部:本周内汇报招聘进度、农村渠道进度及营销方案;每日微信发送满意度日报。 +各部门:今日确定体育文化节方阵补充人员名单;针对专线助账客指标拿出具体保障方案并回复。 +关键信息 +会议时间:2026-05-06 13:37 +核心考核导向:KPI考核聚焦工信部有责与投诉率,强调执行力与日清日结习惯。 +业务风险点:PCDN专线恶化、满意度测评因形式问题导致拉分、招待费预算超支。 +AI建议 +针对满意度考核风险,建议市场部立即复盘测评机制,避免形式主义拉低指标,并提前演练“后机评”应对策略。 +针对招待费超支及客户经理个人垫资问题,建议综合部建立统一审批与结算台账,明确费用归属与报销时效,规避财务与合规风险。 +针对KPI考核准备,建议建立市公司指标动态跟踪表,提前模拟考核场景,强化跨部门协同与数据预埋,确保年底考核不被动。 \ No newline at end of file diff --git a/meeting_example.md b/meeting_example.md new file mode 100644 index 0000000..89ed8da --- /dev/null +++ b/meeting_example.md @@ -0,0 +1,75 @@ + + +# 会议记录 + +议 题:合川分公司周例会(2026第X期) + +时 间:2026年5月6日 13:37—14:23 + +地 点:分公司会议室 + +主持人:AlanPaine + +参加人:分公司领导、各部门经理及相关人员 + +议程: + + 一、各部门汇报 + + 二、分公司领导指示部署 + +--- + +## 会议内容 + +### 一、各部门汇报 + +建维部、综合部、商客市场负责人按议程现场按顺序做汇报。建维部汇报宽带安装受天气影响进度偏后,弱光指标0.51持续向好,三代终端年度目标5.5需持续压降,九零工程月度转化率87.35%接近90%目标,退单率6.53%,PCDN专线学校出口问题正协调限速机制,二级基站拆除预计4月中旬完成;综合部通报建委相关工作清单及投资计划已汇报,打印设备已协调保障招投标需求,工会经费压减后严考严用,食堂改造及自饮机引入方案正在推进,第四届体育文化节方阵人员招募与排练已部署;商客市场2月收入88.5万元实现增长,三期项目二期拆迁完成1145户,社区与单位清洗服务5场落实签约量待提升。 + +--- + +### 二、部署强调 + +#### 建维部负责人强调: + +1. **网络运维与指标管控:** + - 弱光指标0.51持续向好,三代终端年度目标5.5需持续压降,FPTR已达标但主动过境0.3靠后。 + - 九零工程月度转化率87.35%接近90%目标,退单率6.53%,主要受用户原因及改约影响,已建议施工优化BtoC审核撤单流程。 + - PCDN专线因学校出口带宽问题持续恶化,正协调限速机制;超频基站故障已及时处理,专线巡检按计划推进。 + +2. **工作反馈与执行要求:** + - 强调养成“日清日结”习惯,工作回复必须量化、有结果、有措施,杜绝工作拖延数月未动。 + - 针对关键业务上量指标缺乏保障措施问题,要求本周内出具具体可行方案并明确责任人。 + +--- + +#### 综合部负责人强调: + +1. **工会经费与后勤保障:** + - 工会经费全面压减,需严考严用、以更少资金办更好实事。软性工程(更衣室、食堂改造)已确定上报市公司,拟引入自饮机解决饮水问题并节约成本。 + +2. **奖项申报策略:** + - 针对河川区“担当作为先进集体和先进个人”申报,评选条件多为定性要求,建议提前与区领导或分管领导沟通确认意向后再行申报,避免盲目提交浪费资源。 + +--- + +#### 市场部负责人强调: + +1. **季度收官与二季度谋划:** + - 市场部需提前谋划季度收官及二季度业务活动,打破淡季思维,全力推动商客、H业务及AI军团活动升温。 + - 本周内完成招聘情况、农村渠道进度及营销方案汇报;本周六上午视频汇报运动会筹备情况。 + +2. **满意度与考核管控:** + - 深刻反思满意度测评前期工作未做到位问题,要求主管亲自抓。明确满意度及投诉考核标准,不满客户需严格按5:30及5:35节点操作报警。 + - 要求商客经理每日微信发送日报,跟进考核细节,确保指标可控。 + +--- + +#### 分公司主要领导强调: + +1. **强化执行力与作风:** + - 各部门及一线人员必须摒弃“知道怎么做却不去做”的作风,做到事不做好不收兵。分管领导需加强政企部等部门督导力度,必要时亲自沟通。 + +2. **年度考核提前摸底:** + - 针对四公司年度考核及集团相关指标提升,要求各部门提前深入了解考核细则及可能产生重大影响的不利因素并及时上报,切忌定稿后被动。 + - 市公司会统筹考虑分公司整体情况,务必提前布局、赢在起跑线。 \ No newline at end of file diff --git a/meeting_processor.py b/meeting_processor.py new file mode 100644 index 0000000..17f4254 --- /dev/null +++ b/meeting_processor.py @@ -0,0 +1,174 @@ +import hashlib +import logging +import os +from typing import Optional + +from config import config +from extractor import MeetingExtraction, extract_meeting_info +from graph_store import graph_store +from meeting_state import MeetingStateStore +from raw_store import raw_meeting_store +from vector_store import meeting_vector_store + +logger = logging.getLogger(__name__) + +state_store = MeetingStateStore(config.state_path) + + +class MeetingProcessor: + def process_meeting_file(self, filepath: str, force: bool = False) -> Optional[str]: + with open(filepath, "r", encoding="utf-8") as f: + text = f.read() + return self.process_meeting_text(text, force=force) + + def process_meeting_text(self, text: str, force: bool = False) -> Optional[str]: + content_hash = self._compute_content_hash(text) + + if not force and state_store.has_content_hash(content_hash): + print("\n检测到重复内容,已跳过。") + logger.info("Duplicate content hash skipped: %s", content_hash[:12]) + return None + + if not force: + similar = meeting_vector_store.find_similar_text(text, threshold=0.92) + if similar: + meta = similar["metadata"] + print( + f"\n发现相似会议:{meta.get('title', '')} ({meta.get('date', '')}) " + f"相似度 {similar['score']:.2%}" + ) + while True: + choice = input("选择 [s]跳过 / [o]覆盖(默认 s):").strip().lower() or "s" + if choice == "s": + logger.info("Skipped similar meeting: %s", meta.get("title", "")) + return None + if choice == "o": + force = True + break + print("请输入 s 或 o。") + + meeting_data = self._extract(text) + if not meeting_data: + logger.error("Failed to extract meeting information") + return None + + data_dict = meeting_data.model_dump() + data_dict["_content_hash"] = content_hash + data_dict["_graph_meeting_id"] = meeting_vector_store._meeting_id(data_dict) + + should_skip = self._handle_duplicate(data_dict, force) + if should_skip: + return None + + meeting_title = data_dict.get("title", "") + meeting_date = data_dict.get("date", "") + raw_path = raw_meeting_store.save(text, title=meeting_title, date=meeting_date) + + data_dict["_original_text"] = text + data_dict["_original_text_path"] = raw_path + + meeting_filename = f"{meeting_vector_store._meeting_id(data_dict)}.md" + + data_dict["action_items"] = state_store.merge_action_items( + data_dict.get("action_items", []), + meeting_title, + meeting_date, + meeting_filename, + ) + data_dict["metrics"] = state_store.merge_metrics( + data_dict.get("metrics", []), + meeting_title, + meeting_date, + meeting_filename, + ) + + state_store.add_content_hash(content_hash, meeting_title, meeting_date, meeting_filename) + state_store.save() + meeting_vector_store.add_meeting(data_dict) + graph_store.upsert_meeting_subgraph(data_dict) + + logger.info("Meeting processed: %s", meeting_title) + return raw_path + + def _handle_duplicate(self, data_dict: dict, force: bool) -> bool: + title = data_dict.get("title", "") + date = data_dict.get("date", "") + existing = meeting_vector_store.find_meeting(title, date) + + if not existing: + return False + + if force: + logger.info("Duplicate meeting found; overwriting in force mode: %s", title) + self._remove_old(data_dict, existing) + return False + + print(f"\n发现重复会议:{title} ({date})") + while True: + choice = input("选择 [s]跳过 / [o]覆盖(默认 s):").strip().lower() or "s" + if choice == "s": + logger.info("Skipped duplicate meeting: %s", title) + return True + if choice == "o": + self._remove_old(data_dict, existing) + return False + print("请输入 s 或 o。") + + def _remove_old(self, data_dict: dict, existing: Optional[dict] = None): + meeting_id = meeting_vector_store._meeting_id(data_dict) + meeting_vector_store.remove_meeting(meeting_id) + graph_store.remove_meeting_subgraph(meeting_id) + + new_hash = data_dict.get("_content_hash", "") + if new_hash: + state_store.remove_content_hash(new_hash) + + if existing: + old_hash = existing.get("content_hash", "") + if old_hash and old_hash != new_hash: + state_store.remove_content_hash(old_hash) + + logger.info("Removed old meeting artifacts: %s", data_dict.get("title", "")) + + def _compute_content_hash(self, text: str) -> str: + normalized = text.strip().replace("\r\n", "\n") + return hashlib.sha256(normalized.encode("utf-8")).hexdigest() + + def _extract(self, text: str) -> Optional[MeetingExtraction]: + try: + return extract_meeting_info(text) + except Exception as exc: + logger.error("LLM extraction failed: %s", exc) + return None + + def query(self, question: str, top_k: int = 3) -> str: + vector_context = meeting_vector_store.query_as_context(question, top_k=top_k) + graph_results = graph_store.search_facts(question, limit=top_k) + + parts = [] + if vector_context: + parts.append("=== Vector Context ===\n" + vector_context) + + if graph_results: + graph_lines = [] + for idx, row in enumerate(graph_results, start=1): + title = row.get("title", row.get("kind", "graph")) + text = row.get("text", "") + date = row.get("date", "") + suffix = f" ({date})" if date else "" + graph_lines.append(f"[{idx}] {title}{suffix}\n{text}") + parts.append("=== Graph Facts ===\n" + "\n\n".join(graph_lines)) + + return "\n\n".join(parts) + + def stats(self) -> dict: + return { + "vector_index": meeting_vector_store.get_stats(), + "graph": graph_store.get_stats(), + "state": state_store.get_stats(), + "raw_dir": config.storage.raw_dir, + "state_path": config.state_path, + } + + +meeting_processor = MeetingProcessor() diff --git a/meeting_state.py b/meeting_state.py new file mode 100644 index 0000000..e643acf --- /dev/null +++ b/meeting_state.py @@ -0,0 +1,189 @@ +import hashlib +import json +import logging +import os +from datetime import datetime +from typing import Dict, List, Optional + +logger = logging.getLogger(__name__) + + +def _item_id(task: str, assignee: str) -> str: + raw = f"{task}|{assignee}" + return hashlib.md5(raw.encode("utf-8")).hexdigest()[:8] + + +def _metric_id(metric_name: str, owner: str) -> str: + raw = f"{metric_name}|{owner}" + return hashlib.md5(raw.encode("utf-8")).hexdigest()[:8] + + +class MeetingStateStore: + def __init__(self, state_path: str): + self.state_path = state_path + self._state = self._load() + + def _load(self) -> dict: + if os.path.exists(self.state_path): + try: + with open(self.state_path, "r", encoding="utf-8") as f: + return json.load(f) + except Exception as e: + logger.warning(f"加载状态文件失败,将创建新状态: {e}") + return { + "action_items": {}, + "metrics": {}, + "meeting_series": {}, + "content_hashes": {}, + } + + def save(self): + os.makedirs(os.path.dirname(self.state_path), exist_ok=True) + with open(self.state_path, "w", encoding="utf-8") as f: + json.dump(self._state, f, ensure_ascii=False, indent=2) + + def _ensure_series(self, meeting_title: str, meeting_date: str) -> str: + series_name = self._detect_series(meeting_title) + series = self._state["meeting_series"].get(series_name) + if not series: + series = {"latest_date": meeting_date, "processed_titles": []} + self._state["meeting_series"][series_name] = series + if meeting_date > series.get("latest_date", ""): + series["latest_date"] = meeting_date + if meeting_title not in series["processed_titles"]: + series["processed_titles"].append(meeting_title) + return series_name + + def _detect_series(self, title: str) -> str: + import re + cleaned = re.sub(r"(\d{4}第\w+期)", "", title) + cleaned = re.sub(r"\(\d{4}第\w+期\)", "", cleaned) + cleaned = re.sub(r"\d{4}第\w+期", "", cleaned) + cleaned = re.sub(r"\d{4}年第\w+次", "", cleaned) + cleaned = cleaned.strip("-_ ") + return cleaned or title + + def merge_action_items( + self, + new_items: List[dict], + meeting_title: str, + meeting_date: str, + meeting_filename: str, + ) -> List[dict]: + series_name = self._ensure_series(meeting_title, meeting_date) + merged = [] + + for item in new_items: + task = item.get("task", "") + assignee = item.get("assignee", "") + iid = _item_id(task, assignee) + + history_entry = { + "date": meeting_date, + "meeting": meeting_filename, + "status": item.get("status", "待办"), + "priority": item.get("priority", "中"), + "deadline": item.get("deadline", ""), + } + + existing = self._state["action_items"].get(iid) + if existing: + existing["history"].append(history_entry) + existing["latest"] = history_entry + latest = existing["history"][-1] + item["_item_id"] = iid + item["_history"] = list(existing["history"]) + item["status"] = latest["status"] + item["priority"] = latest["priority"] + item["deadline"] = latest["deadline"] + else: + self._state["action_items"][iid] = { + "item_id": iid, + "task": task, + "assignee": assignee, + "series": series_name, + "created_meeting": meeting_filename, + "history": [history_entry], + "latest": history_entry, + } + item["_item_id"] = iid + item["_history"] = [history_entry] + + merged.append(item) + + return merged + + def merge_metrics( + self, + new_metrics: List[dict], + meeting_title: str, + meeting_date: str, + meeting_filename: str, + ) -> List[dict]: + merged = [] + + for m in new_metrics: + metric_name = m.get("metric_name", "") + owner = m.get("owner", "") + mid = _metric_id(metric_name, owner) + + history_entry = { + "date": meeting_date, + "meeting": meeting_filename, + "value": m.get("value", ""), + "target": m.get("target", ""), + "trend": m.get("trend", ""), + } + + existing = self._state["metrics"].get(mid) + if existing: + existing["history"].append(history_entry) + existing["latest"] = history_entry + item = m + item["_metric_id"] = mid + item["_history"] = list(existing["history"]) + else: + self._state["metrics"][mid] = { + "metric_id": mid, + "metric_name": metric_name, + "owner": owner, + "history": [history_entry], + "latest": history_entry, + } + m["_metric_id"] = mid + m["_history"] = [history_entry] + + merged.append(m) + + return merged + + def get_action_item_history(self, item_id: str) -> Optional[dict]: + return self._state["action_items"].get(item_id) + + def get_metric_history(self, metric_id: str) -> Optional[dict]: + return self._state["metrics"].get(metric_id) + + def get_series_info(self, title: str) -> Optional[dict]: + series_name = self._detect_series(title) + return self._state["meeting_series"].get(series_name) + + def has_content_hash(self, content_hash: str) -> bool: + return content_hash in self._state["content_hashes"] + + def add_content_hash(self, content_hash: str, title: str, date: str, filename: str): + self._state["content_hashes"][content_hash] = { + "title": title, + "date": date, + "filename": filename, + } + + def remove_content_hash(self, content_hash: str): + self._state["content_hashes"].pop(content_hash, None) + + def get_stats(self) -> dict: + return { + "action_items_tracked": len(self._state["action_items"]), + "metrics_tracked": len(self._state["metrics"]), + "meeting_series": len(self._state["meeting_series"]), + "content_hashes": len(self._state["content_hashes"]), + } \ No newline at end of file diff --git a/raw_store.py b/raw_store.py new file mode 100644 index 0000000..6800ad3 --- /dev/null +++ b/raw_store.py @@ -0,0 +1,52 @@ +import logging +import os +from datetime import datetime + +from config import config + +logger = logging.getLogger(__name__) + + +def _sanitize_filename(name: str) -> str: + if not name: + return "untitled" + invalid = '<>:"/\\|?*' + for char in invalid: + name = name.replace(char, "") + name = name.replace(" ", "_").strip("._") + return name or "untitled" + + +class RawMeetingStore: + def __init__(self): + self.raw_dir = config.storage.raw_dir + os.makedirs(self.raw_dir, exist_ok=True) + + def save(self, text: str, title: str = "", date: str = "") -> str: + date_str = date or datetime.now().strftime("%Y-%m-%d") + safe_title = _sanitize_filename(title)[:60] + filename = f"{date_str}_{safe_title}.md" + filepath = os.path.join(self.raw_dir, filename) + + content = "\n".join( + [ + "---", + f'title: "{title}"', + f'date: "{date_str}"', + "status: archived", + "---", + "", + f"# {title or 'Untitled Meeting'}", + "", + text, + "", + ] + ) + with open(filepath, "w", encoding="utf-8") as f: + f.write(content) + + logger.info("Saved raw meeting text: %s", filepath) + return filepath + + +raw_meeting_store = RawMeetingStore() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1369a52 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +openai>=1.0.0 +pydantic>=2.0.0 +llama-index>=0.10.0 +llama-index-embeddings-openai>=0.1.0 +llama-index-vector-stores-chroma>=0.1.0 +chromadb>=0.5.0 +python-dotenv>=1.0.0 +neo4j>=5.26.0 diff --git a/vector_store.py b/vector_store.py new file mode 100644 index 0000000..6c22e10 --- /dev/null +++ b/vector_store.py @@ -0,0 +1,259 @@ +import hashlib +import json +import logging +import os +import re +from typing import List, Optional + +from openai import OpenAI as OpenAI_Client +from llama_index.core import ( + Document, + VectorStoreIndex, + StorageContext, + load_index_from_storage, +) +from llama_index.core.embeddings import BaseEmbedding +from llama_index.core.settings import Settings + +from config import config + +logger = logging.getLogger(__name__) + + +class CustomOpenAIEmbedding(BaseEmbedding): + def __init__( + self, + model: str = "text-embedding-ada-002", + api_key: Optional[str] = None, + api_base: Optional[str] = None, + **kwargs, + ): + super().__init__(model_name=model, **kwargs) + self._client = OpenAI_Client( + api_key=api_key or "not-needed", + base_url=api_base, + ) + self._model = model + + async def _aget_query_embedding(self, query: str) -> List[float]: + return self._get_embedding(query) + + async def _aget_text_embedding(self, text: str) -> List[float]: + return self._get_embedding(text) + + def _get_query_embedding(self, query: str) -> List[float]: + return self._get_embedding(query) + + def _get_text_embedding(self, text: str) -> List[float]: + return self._get_embedding(text) + + def _get_embedding(self, text: str) -> List[float]: + resp = self._client.embeddings.create( + model=self._model, + input=text, + ) + return resp.data[0].embedding + + +class MeetingVectorStore: + def __init__(self): + embed_model = CustomOpenAIEmbedding( + model=config.embedding.model, + api_key=config.embedding.api_key or None, + api_base=config.embedding.api_base if config.embedding.api_base else None, + ) + Settings.embed_model = embed_model + + self.persist_dir = config.vector_store.persist_dir + self._index: Optional[VectorStoreIndex] = None + self._load_or_create_index() + + def _load_or_create_index(self): + if os.path.exists(os.path.join(self.persist_dir, "docstore.json")): + try: + storage_context = StorageContext.from_defaults(persist_dir=self.persist_dir) + self._index = load_index_from_storage(storage_context) + logger.info(f"从磁盘加载向量索引: {self.persist_dir}") + return + except Exception as e: + logger.warning(f"加载向量索引失败,将创建新索引: {e}") + + self._index = VectorStoreIndex.from_documents([]) + logger.info("创建新的向量索引") + + def _save(self): + if self._index: + os.makedirs(self.persist_dir, exist_ok=True) + self._index.storage_context.persist(persist_dir=self.persist_dir) + + def _meeting_id(self, meeting_data: dict) -> str: + title = meeting_data.get("title", "") + date = meeting_data.get("date", "") + raw = f"{date}_{title}" + return f"meeting_{hashlib.md5(raw.encode('utf-8')).hexdigest()[:12]}" + + def find_meeting(self, title: str, date: str = "") -> Optional[dict]: + if not self._index: + return None + query_text = f"会议标题: {title}" + if date: + query_text += f" 日期: {date}" + try: + results = self.query(query_text, top_k=3) + for r in results: + meta = r.get("metadata", {}) + meta_title = meta.get("title", "") + if meta_title == title or (date and meta.get("date") == date): + return meta + return None + except Exception as e: + logger.warning(f"会议查重查询失败: {e}") + return None + + def find_similar_text(self, text: str, threshold: float = 0.92) -> Optional[dict]: + if not self._index: + return None + try: + retriever = self._index.as_retriever(similarity_top_k=3) + nodes = retriever.retrieve(text) + for node in nodes: + if node.score is not None and node.score > threshold: + return { + "metadata": node.metadata, + "score": node.score, + } + return None + except Exception as e: + logger.warning(f"文本相似度查重失败: {e}") + return None + + def remove_meeting(self, meeting_id: str) -> bool: + if not self._index: + return False + try: + for field in self._FIELD_TYPES: + self._index.delete_ref_doc(f"{meeting_id}_{field}") + self._save() + logger.info(f"已从向量索引移除会议: {meeting_id}") + return True + except Exception as e: + logger.warning(f"移除向量索引失败: {e}") + return False + + _FIELD_TYPES = ["header", "summary", "action_items", "metrics", "decisions", "relations", "entities"] + + def add_meeting(self, meeting_data: dict) -> bool: + try: + meeting_id = self._meeting_id(meeting_data) + original_text_path = meeting_data.get("_original_text_path", "") + original_text = meeting_data.get("_original_text", "") + + base_metadata = { + "title": meeting_data.get("title", ""), + "date": meeting_data.get("date", ""), + "participants": ", ".join(meeting_data.get("participants", [])), + "type": "meeting", + "content_hash": meeting_data.get("_content_hash", ""), + "original_text_path": original_text_path, + "original_text_excerpt": original_text[:500] if original_text else "", + "meeting_id": meeting_id, + } + + docs = self._build_field_docs(meeting_data, base_metadata, meeting_id) + + if self._index: + for doc in docs: + self._index.insert(doc) + self._save() + logger.info(f"会议 '{meeting_data.get('title')}' 已添加到向量索引 (id={meeting_id}, 字段数={len(docs)})") + return True + except Exception as e: + logger.error(f"添加会议到向量索引失败: {e}") + return False + + def _build_field_docs(self, data: dict, base: dict, meeting_id: str) -> List[Document]: + docs = [] + + header = f"# {data.get('title', '')}" + if data.get("date"): + header += f"\n日期: {data['date']}" + if data.get("participants"): + header += f"\n参会人: {', '.join(data['participants'])}" + docs.append(Document(text=header, metadata={**base, "field": "header"}, doc_id=f"{meeting_id}_header")) + + if data.get("summary"): + docs.append(Document(text=data["summary"], metadata={**base, "field": "summary"}, doc_id=f"{meeting_id}_summary")) + + if data.get("action_items"): + lines = [] + for item in data["action_items"]: + status = item.get('status', '待办') + lines.append(f"- [{status}] {item.get('task', '')} (负责人: {item.get('assignee', '')}, 截止: {item.get('deadline', '')}, 优先级: {item.get('priority', '')})") + history = item.get("_history", []) + if len(history) > 1: + lines.append(" 演变: " + " → ".join(f"{h.get('date','')}({h.get('status','')})" for h in history)) + docs.append(Document(text="\n".join(lines), metadata={**base, "field": "action_items"}, doc_id=f"{meeting_id}_action_items")) + + if data.get("metrics"): + lines = [] + for m in data["metrics"]: + lines.append(f"- {m.get('metric_name', '')}: {m.get('value', '')} (目标: {m.get('target', '')}, 趋势: {m.get('trend', '')})") + docs.append(Document(text="\n".join(lines), metadata={**base, "field": "metrics"}, doc_id=f"{meeting_id}_metrics")) + + if data.get("decisions"): + lines = [f"- {d.get('content', '')}" for d in data["decisions"]] + docs.append(Document(text="\n".join(lines), metadata={**base, "field": "decisions"}, doc_id=f"{meeting_id}_decisions")) + + if data.get("relations"): + lines = [f"- {r.get('subject', '')} --{r.get('predicate', '')}--> {r.get('object', '')}" for r in data["relations"]] + docs.append(Document(text="\n".join(lines), metadata={**base, "field": "relations"}, doc_id=f"{meeting_id}_relations")) + + if data.get("entities"): + lines = [f"- [{e.get('entity_type', '')}] {e.get('name', '')}: {e.get('description', '')}" for e in data["entities"]] + docs.append(Document(text="\n".join(lines), metadata={**base, "field": "entities"}, doc_id=f"{meeting_id}_entities")) + + return docs + + def query(self, question: str, top_k: int = 5) -> List[dict]: + if not self._index: + return [] + try: + retriever = self._index.as_retriever(similarity_top_k=top_k) + nodes = retriever.retrieve(question) + results = [] + for node in nodes: + results.append({ + "text": node.text, + "score": node.score, + "metadata": node.metadata, + }) + return results + except Exception as e: + logger.error(f"查询向量索引失败: {e}") + return [] + + def query_as_context(self, question: str, top_k: int = 3) -> str: + results = self.query(question, top_k=top_k) + if not results: + return "" + parts = [] + for i, r in enumerate(results): + metadata = r.get("metadata", {}) + parts.append(f"[{i+1}] {metadata.get('title', '未知会议')} ({metadata.get('date', '')})\n{r['text']}\n") + return "\n".join(parts) + + def get_stats(self) -> dict: + if not self._index: + return {"doc_count": 0, "node_count": 0} + try: + docstore = self._index.docstore + docs = list(docstore.docs.values()) if hasattr(docstore, 'docs') else [] + return { + "doc_count": len(docstore.docs) if hasattr(docstore, 'docs') else 0, + "node_count": len(docs), + } + except Exception: + return {"doc_count": 0, "node_count": 0} + + +meeting_vector_store = MeetingVectorStore() \ No newline at end of file