#!/usr/bin/env python3
"""
项目任务扫描同步器
==================
扫描各项目空间的 MEMORY.md 和最新日志，提取任务条目，
去重后合并到日程系统 tasks.json。
由每日推送自动化调用，保证项目空间的任务自动汇入个人日程。
"""

import json
import re
from datetime import date, timedelta
from pathlib import Path
from copy import deepcopy

BASE = Path(__file__).parent
WORKBUDDY = Path.home() / "WorkBuddy"
DATA_DIR = BASE / "data"


def load_json(path):
    """Return the parsed JSON document stored at ``path``.

    Args:
        path: A ``pathlib.Path`` pointing at a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON value, or an empty dict when ``path`` does not exist.
    """
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return {}


def save_json(path, data):
    """Write ``data`` to ``path`` as pretty-printed UTF-8 JSON.

    The document is serialized in memory first, written to a sibling
    ``<name>.tmp`` file, and then renamed over ``path``. A serialization
    error or an interrupted write therefore never leaves the existing file
    truncated or half-written.

    Args:
        path: Destination file (``pathlib.Path`` or string path).
        data: JSON-serializable value.

    Raises:
        TypeError: If ``data`` contains a value that JSON cannot represent.
        OSError: If the temporary file cannot be written or renamed.
    """
    target = Path(path)
    # Serialize before touching the filesystem so bad data cannot clobber
    # the previous contents.
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(payload)
        tmp_path.replace(target)
    finally:
        # After a successful rename the temp file is gone; this only cleans
        # up after a failed write.
        if tmp_path.exists():
            tmp_path.unlink()


# ── Parsing rules ──────────────────────────────────

# Task patterns applied to MEMORY.md. Each entry is (regex, priority label).
# The table-row pattern captures two cells (groups 1 and 2); every other
# pattern captures the task text in group 1.
# NOTE(review): consumers may depend on the order of these entries — check
# extract_tasks_from_text before reordering.
TODO_PATTERNS = [
    # "待补：xxx" (to be supplemented)
    (r"待补[：:]\s*(.+?)(?:$|\n)", "P3"),
    # "待开发" / "❌ 待开发" (to be developed), outside of a table
    (r"[❌⚠]?\s*待开发[：:]?\s*([^\|\n]+?)(?:$|\n)", "P2"),
    # Markdown table row: "| xxx | xxx | ❌ 待开发"
    (r"\|\s*(\S+)\s*\|\s*([^|]+?)\s*\|\s*❌\s*待开发", "P2"),
    # "待确认：xxx" (to be confirmed)
    (r"待确认[：:]\s*(.+?)(?:$|\n)", "P3"),
    # "□" or "☐" checkboxes
    (r"[□☐]\s+(.+?)(?:$|\n)", "P3"),
]

# 表匹配的后处理
def _clean_table_match(match):
    return f"{match.group(1)}{match.group(2)}"

TODO_TABLE_FIXERS = [0]  # 第3个pattern是表格匹配

# Title blacklist: regexes for progress descriptions that the task patterns
# pick up by mistake. extract_tasks_from_text drops titles matching any of them.
TITLE_BLACKLIST = [
    r"理论逻辑梳理中",
    r"进度\s*\d+%",
    r"Q1-Q7草稿完成",
    r"三层设计草稿",
    r"画布引导确认",
]

# Active-task patterns applied to the daily log files.
# Each entry is (regex, priority label); the task text is capture group 1.
LOG_TASK_PATTERNS = [
    # "*  待处理：xxx" or "*  待：xxx" (pending)
    (r"\*\s+待(?:处理)?[：:]\s*(.+?)(?:$|\n)", "P2"),
    # "*  TODO：xxx"
    (r"\*\s+TODO[：:]\s*(.+?)(?:$|\n)", "P2"),
    # "→ 下一步：xxx" (next step); the arrow itself is not required to match
    (r"下一步[：:]\s*(.+?)(?:$|\n)", "P2"),
    # "明日起点：xxx" (starting point for tomorrow)
    (r"明日起点[：:]\s*(.+?)(?:$|\n)", "P1"),
]

# Patterns applied to task-tracking files.
TASK_TRACKING_PATTERNS = [
    # Numbered item with a bold title: "1. **Title**: details"
    # (group 1 is the bold title, group 2 the trailing text).
    (r"^\d+\.\s+\*\*(.+?)\*\*[:：]?\s*(.+?)(?:$|\n)", "P2"),
    # Markdown checkbox item: "- [ ] xxx" or "- [x] xxx".
    # NOTE(review): the character class also accepts checked "[x]" boxes, so
    # items already ticked off are extracted too — confirm this is intended.
    (r"^-\s+\[\s*[x ]\s*\]\s+(.+?)(?:$|\n)", "P2"),
]


def extract_tasks_from_text(text, patterns, source="memory", blacklist=None):
    """Extract de-duplicated task entries from ``text``.

    Args:
        text: Raw text to scan.
        patterns: Sequence of ``(regex, priority_label)`` pairs. The regex is
            a pattern string searched with ``re.MULTILINE``; the label looks
            like ``"P2"`` and its second character becomes the numeric
            priority.
        source: Value stored under ``"source_type"`` in every result.
        blacklist: Optional iterable of regexes; a title matching any of
            them is dropped. Defaults to the module-level ``TITLE_BLACKLIST``.

    Returns:
        A list of dicts with the keys ``title``, ``priority``,
        ``source_pattern`` (first 30 characters of the regex) and
        ``source_type``, ordered by pattern and then by position in the text.
        Titles shorter than 3 or longer than 100 characters are skipped, and
        each distinct title is reported once.
    """
    if blacklist is None:
        blacklist = TITLE_BLACKLIST

    tasks = []
    seen = set()
    for pattern, default_priority in patterns:
        # A Markdown table-row pattern starts with a literal pipe and captures
        # two cells. Deciding from the pattern itself, rather than from its
        # position in the list, keeps single-group patterns from hitting
        # ``group(2)`` and raising IndexError.
        is_table_row = pattern.startswith(r"\|")
        for m in re.finditer(pattern, text, re.MULTILINE):
            if is_table_row and m.re.groups >= 2:
                title = f"{m.group(1)}{m.group(2)}"
            else:
                title = m.group(1).strip().rstrip(".")

            # Collapse runs of whitespace so cosmetic differences do not
            # defeat de-duplication.
            title = re.sub(r"\s{2,}", " ", title).strip()
            if len(title) < 3 or len(title) > 100:
                continue
            if any(re.search(bl, title) for bl in blacklist):
                continue
            if title in seen:
                continue
            seen.add(title)
            tasks.append({
                "title": title,
                "priority": int(default_priority[1]),
                "source_pattern": pattern[:30],
                "source_type": source,
            })
    return tasks


def scan_project(project_id, project_name, workspace_dir):
    """Collect candidate tasks from one project workspace.

    Three sources are scanned, in this order:

    1. ``.workbuddy/memory/MEMORY.md``, using ``TODO_PATTERNS``.
    2. The first three ``20*.md`` files of ``.workbuddy/memory`` when sorted
       in descending order (presumably date-named daily logs, so the three
       most recent), using ``LOG_TASK_PATTERNS``. Each resulting task also
       gets a ``log_date`` key holding the file stem.
    3. Every ``.md``/``.txt`` file anywhere under the workspace whose name
       contains a tracking keyword, using ``TASK_TRACKING_PATTERNS``.

    Args:
        project_id: Identifier stored on every task as ``project_id``.
        project_name: Display name stored on every task as ``project_name``.
        workspace_dir: ``pathlib.Path`` of the project workspace root.

    Returns:
        A list of task dicts as produced by ``extract_tasks_from_text``,
        annotated with the project id and name.
    """
    tasks = []
    memory_dir = workspace_dir / ".workbuddy" / "memory"

    # 1. Long-lived project memory file.
    memory_path = memory_dir / "MEMORY.md"
    if memory_path.exists():
        text = memory_path.read_text(encoding="utf-8")
        tasks.extend(extract_tasks_from_text(text, TODO_PATTERNS, "memory"))

    # 2. Up to three log files, highest name first.
    log_files = sorted(
        (f for f in memory_dir.glob("20*.md") if f.name != "MEMORY.md"),
        reverse=True,
    )[:3]
    for lf in log_files:
        text = lf.read_text(encoding="utf-8")
        day_tasks = extract_tasks_from_text(text, LOG_TASK_PATTERNS, f"log:{lf.stem}")
        for t in day_tasks:
            t["log_date"] = lf.stem
        tasks.extend(day_tasks)

    # 3. Task-tracking files. A name can contain several keywords (for
    #    example "TODO-任务追踪.md"), so remember what was already scanned to
    #    avoid extracting the same file's tasks more than once.
    scanned_paths = set()
    for keyword in ("任务追踪", "任务看板", "TASKS", "TODO"):
        for path in workspace_dir.rglob(f"*{keyword}*"):
            if path in scanned_paths:
                continue
            if path.is_file() and path.suffix in (".md", ".txt"):
                scanned_paths.add(path)
                text = path.read_text(encoding="utf-8")
                tasks.extend(
                    extract_tasks_from_text(
                        text, TASK_TRACKING_PATTERNS, f"tracking:{path.name}"
                    )
                )

    # Tag every task with the project it came from.
    for t in tasks:
        t["project_id"] = project_id
        t["project_name"] = project_name

    return tasks


def merge_into_schedule(all_scanned_tasks):
    """Merge scanned tasks into ``tasks.json``, skipping ones already present.

    A scanned task counts as already present when an existing entry has the
    same ``project_id`` and the same title, compared case-insensitively after
    stripping surrounding whitespace. New tasks are appended with status
    ``"pending"`` and a freshly generated ``t-NNN`` id. The file is always
    rewritten, even when nothing was added, so ``_last_scan`` reflects today.

    Args:
        all_scanned_tasks: Iterable of task dicts carrying at least
            ``project_id`` and ``title`` (as produced by ``scan_project``).

    Returns:
        The number of tasks appended.
    """
    tasks_path = DATA_DIR / "tasks.json"
    data = load_json(tasks_path)
    existing = data.get("tasks", [])

    # Fingerprints of the tasks already stored. ``or ""`` tolerates entries
    # whose title is stored as JSON null.
    existing_fingerprints = {
        (t.get("project_id", ""), (t.get("title") or "").strip().lower())
        for t in existing
    }

    new_count = 0
    today = date.today().isoformat()

    for st in all_scanned_tasks:
        pid = st.get("project_id", "")
        title = st.get("title", "").strip()
        fp = (pid, title.lower())
        if fp in existing_fingerprints:
            continue

        task = {
            "id": _next_id(existing),
            "category": "project",
            "project_id": pid,
            "title": title,
            "description": f"来源：{st.get('source_type', 'scan')} · {st.get('project_name', '')}",
            "estimated_minutes": 120,
            "priority": st.get("priority", 3),
            "deadline": None,
            "status": "pending",
            "scheduled_date": None,
            "scheduled_time": None,
            "tags": ["scanned", pid.replace("-", " ")],
            "source": "project_scan",
            "source_task_id": None,
            "created_at": f"{today}T08:00:00",
            "updated_at": f"{today}T08:00:00"
        }
        existing.append(task)
        # Register the fingerprint so duplicates inside this batch are
        # skipped as well.
        existing_fingerprints.add(fp)
        new_count += 1

    data["tasks"] = existing
    data["_last_scan"] = today
    # load_json tolerates a missing file, so the data directory may not
    # exist yet on a first run; create it before writing.
    tasks_path.parent.mkdir(parents=True, exist_ok=True)
    save_json(tasks_path, data)

    return new_count


def _next_id(existing):
    """生成下一个任务ID"""
    max_num = 0
    for t in existing:
        tid = t.get("id", "")
        m = re.match(r"t-(\d+)", tid)
        if m:
            max_num = max(max_num, int(m.group(1)))
    return f"t-{max_num + 1:03d}"


def run_scan(dry_run=False):
    """Scan every registered project and return a text report.

    Projects are read from ``projects.json`` in the data directory. A project
    without a ``workspace`` value, or whose workspace directory does not
    exist, is only mentioned in the report. With ``dry_run`` set, nothing is
    written and the report previews up to ten of the discovered tasks;
    otherwise the tasks are merged into the schedule.

    Args:
        dry_run: When true, skip the merge step.

    Returns:
        The report as a newline-joined string.
    """
    registry = load_json(DATA_DIR / "projects.json")
    found = []
    lines = []

    for project in registry.get("projects", []):
        project_id = project["id"]
        label = project["name"]
        workspace = project.get("workspace", "")
        if not workspace:
            lines.append(f"- {label}：无需扫描（无工作区）")
            continue

        root = WORKBUDDY / workspace
        if root.exists():
            project_tasks = scan_project(project_id, label, root)
            found.extend(project_tasks)
            lines.append(f"- {label}：发现 {len(project_tasks)} 个潜在任务")
        else:
            lines.append(f"- {label}：工作区不存在（{root}）")

    if not dry_run:
        added = merge_into_schedule(found)
        lines.append(f"\n合计：{len(found)} 个发现，{added} 个新增（已去重合并）")
        return "\n".join(lines)

    # Dry run: summarize and preview without touching tasks.json.
    lines.append(f"\n合计：{len(found)} 个待合并任务（dry-run，未写入）")
    lines.extend(
        f"  · [{item.get('project_name', '?')}] {item['title']}"
        for item in found[:10]
    )
    if len(found) > 10:
        lines.append(f"  ... 另有 {len(found) - 10} 项")
    return "\n".join(lines)


if __name__ == "__main__":
    import sys

    # "--dry" or "-n" anywhere on the command line requests a dry run.
    dry = any(flag in sys.argv for flag in ("--dry", "-n"))
    print(run_scan(dry_run=dry))
