#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
pdf2ai (rev4)
- Q/A detection is strict against false positives (e.g., "Q&A" headlines)
  but now permits colon-less single-line speaker labels:
    Q side: line-start + (colon OR single-line label)
    A side: line-start + (colon OR single-line label)
- Examples counted as dialogue:
    User\n<text>\nChatGPT\n<text>         (colon-less, single-line labels)
    Q: ...\nA: ...                        (colon labels)
- CSV adds "合計○,<count>" at the end.
- Per-year summary written as CSV-like "YYYY-MM, <count>" to summary-YYYY.txt.

Usage:
  python pdf2ai 2025-10.pdf
  python pdf2ai "/path/to/2024-*.pdf"
"""
import sys, os, re, glob, calendar
from typing import List, Tuple, Dict
from collections import defaultdict

try:
    from PyPDF2 import PdfReader
except Exception as e:
    print("ERROR: PyPDF2 is required to run this script. Please install with 'pip install PyPDF2'.", file=sys.stderr)
    raise

def _unsplit_digits(s: str) -> str:
    """Return *s* with every run of whitespace removed.

    Used on the digit groups captured by ``DATE_RE``, which may contain
    whitespace between the individual digits, so the result can be passed
    to ``int()``.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('', s)

# Japanese-style calendar date of the form "YYYY年M月D日".
# - Whitespace is tolerated between the individual digits and around the
#   年 / 月 / 日 markers (presumably because PDF text extraction can scatter
#   spaces inside a number — TODO confirm); split_by_dates() removes it with
#   _unsplit_digits() before converting the groups to int.
# - The month and day markers are character classes holding two glyphs each:
#   the regular kanji and a look-alike variant (presumably the Kangxi-radical
#   code point some PDF fonts emit — TODO confirm the exact code points).
# Named groups: Y (four digits), M and D (one or two digits each) and ymd
# (the whole date).
DATE_RE = re.compile(
    r'(?P<ymd>'
    r'(?P<Y>\d(?:\s*\d){3})\s*年\s*'
    r'(?P<M>\d(?:\s*\d)?)\s*[月⽉]\s*'
    r'(?P<D>\d(?:\s*\d)?)\s*[日⽇]'
    r')',
    flags=re.UNICODE
)

# Speaker labels that open a question (Q_MARK) or an answer (A_MARK).
# Both patterns are case-insensitive and multiline: a label only counts when it
# starts a line, optionally preceded by spaces, tabs or full-width spaces.
# Two label forms are accepted:
#   1. a label followed by a half-width or full-width colon
#      ("Q:", "User：", "Ｑ（…）：");
#   2. a speaker name alone on its line, with no colon ("User", "ChatGPT").
# The bare letters Q / A are accepted in the colon form only.
Q_MARK = re.compile(
    r'(?im)^[ \t　]*(?:'
    r'(?:Q|Ｑ)[ \t　]*[：:]'                         # Q: / Ｑ：
    r'|(?:User|Users|ユーザー)[ \t　]*[：:]'        # User: / Users: / ユーザー：
    r'|Ｑ（[^）]*）[ \t　]*[：:]'                   # Ｑ（…）： (parenthesised note after Ｑ)
    r'|(?:User|Users|ユーザー)[ \t　]*$'            # colon-less form: the name is the whole line
    r')'
)

A_MARK = re.compile(
    r'(?im)^[ \t　]*(?:'
    r'(?:A|Ａ)[ \t　]*[：:]'                        # A: / Ａ：
    r'|(?:ChatGPT|Gemini|Claude|Perplexity|Grok|DeepSeek|Qwen|Assistant|アシスタント)[ \t　]*[：:]'  # assistant name + colon
    r'|Ａ（[^）]*）[ \t　]*[：:]'                   # Ａ（…）： (parenthesised note after Ａ)
    r'|(?:ChatGPT|Gemini|Claude|Perplexity|Grok|DeepSeek|Qwen|Assistant|アシスタント)[ \t　]*$' # colon-less form: the name is the whole line
    r')'
)

# Detects "Q&A"-style wording so has_qa_dialogue() can refuse to treat a heading
# as a dialogue. The "&" and the surrounding whitespace are optional, and the
# pattern is unanchored and case-insensitive, so besides "Q&A" it also matches
# "QA", "Q A" and any "qa" letter pair inside a longer word.
QANDA = re.compile(r'(?i)Q\s*&?\s*A')

def extract_text_from_pdf(path: str) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Extraction is best-effort: a page whose text cannot be extracted
    contributes an empty string, so one damaged page does not abort the whole
    file. The failure is reported on stderr rather than dropped silently,
    because a missing page can change the result computed for a day.

    Args:
        path: Location of the PDF file.

    Returns:
        The page texts in page order, separated by a newline character.
        Pages without extractable text contribute an empty string.
    """
    reader = PdfReader(path)
    chunks = []
    for page_no, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text() or ""
        except Exception as exc:
            # Keep going, but leave a trace of which page was lost.
            print(
                f"[WARN] {path}: page {page_no}: text extraction failed: {exc}",
                file=sys.stderr,
            )
            text = ""
        chunks.append(text)
    return "\n".join(chunks)

def split_by_dates(month_text: str, y: int, m: int) -> List[Tuple[int, Tuple[int, int]]]:
    """Locate the text span that belongs to each dated entry of one month.

    Every ``DATE_RE`` hit whose year and month equal ``y`` and ``m`` opens a
    span that runs up to the next such hit. Dates from other months are
    ignored until day 1 of the target month has been seen; the first foreign
    date after that ends the scan.

    Args:
        month_text: Full text extracted from the month's PDF.
        y: Target year.
        m: Target month number.

    Returns:
        ``(day, (start, end))`` tuples in document order, where
        ``month_text[start:end]`` is the text of that entry. A day appears
        more than once when its date occurs repeatedly in the text. The
        list is empty when no date of the target month is found.
    """
    starts = []  # (day, offset) of each in-month date, in document order
    stop = len(month_text)
    seen_first_day = False
    for hit in DATE_RE.finditer(month_text):
        year = int(_unsplit_digits(hit.group('Y')))
        month = int(_unsplit_digits(hit.group('M')))
        if year == y and month == m:
            day = int(_unsplit_digits(hit.group('D')))
            starts.append((day, hit.start()))
            if day == 1:
                seen_first_day = True
        elif seen_first_day:
            # The month ends here. Close the last span at this foreign date so
            # that text belonging to another month is not attributed to the
            # last in-month entry.
            stop = hit.start()
            break
        # Foreign dates seen before day 1 are skipped and stay inside whatever
        # span is currently open.

    # Each span ends where the next one starts; the last ends at ``stop``.
    ends = [offset for _, offset in starts[1:]] + [stop]
    return [(day, (begin, end)) for (day, begin), end in zip(starts, ends)]

def has_qa_dialogue(text_block: str) -> bool:
    """Report whether *text_block* holds a question label followed by an answer label.

    The result is True only when ``Q_MARK`` matches somewhere in the block and
    ``A_MARK`` matches when searching from the start of that first question
    label. A block that merely mentions "Q&A" without both kinds of label is
    rejected.
    """
    first_q = Q_MARK.search(text_block)

    # "Q&A" wording without both kinds of label is a headline, not a dialogue.
    if QANDA.search(text_block):
        if first_q is None or A_MARK.search(text_block) is None:
            return False

    if first_q is None:
        return False

    # Only an answer label at or after the first question label counts.
    return A_MARK.search(text_block, first_q.start()) is not None

def analyze_month(pdf_path: str):
    """Classify every day of the month covered by one PDF.

    The year and month are taken from the first ``YYYY-MM`` occurrence in the
    file name. Each calendar day of that month is marked "○" when its text
    contains a Q/A dialogue and "△" otherwise (including days with no text).

    Args:
        pdf_path: Path of the PDF; its base name must contain ``YYYY-MM``.

    Returns:
        ``(month_key, results)`` where ``month_key`` is ``"YYYY-MM"`` and
        ``results`` maps each day number of the month to "○" or "△".

    Raises:
        ValueError: If the file name carries no ``YYYY-MM`` or the month
            number is not a valid calendar month.
    """
    base = os.path.basename(pdf_path)
    mm = re.search(r'(\d{4})-(\d{2})', base)
    if not mm:
        raise ValueError(f"Cannot infer YYYY-MM from filename: {base}")
    y, m = int(mm.group(1)), int(mm.group(2))
    month_key = f"{y:04d}-{m:02d}"

    # Validate the month before the PDF is read: an impossible month number
    # fails here with calendar's ValueError subclass.
    _, last_day = calendar.monthrange(y, m)

    text = extract_text_from_pdf(pdf_path)

    # A day's date can occur several times in the text, giving several spans
    # for the same day. Concatenate them instead of keeping only the last one,
    # so no part of the day's text is lost.
    day_text = defaultdict(str)
    for d, (s, e) in split_by_dates(text, y, m):
        day_text[d] += text[s:e]

    results = {}
    for d in range(1, last_day + 1):
        block = day_text.get(d, "")
        results[d] = "○" if (block and has_qa_dialogue(block)) else "△"
    return month_key, results

def write_csv(month_key: str, results, outdir: str) -> str:
    """Write the per-day marks of one month to ``<outdir>/<month_key>.csv``.

    One ``day,mark`` line is written per entry of *results* in ascending day
    order, followed by a final ``合計○,<count>`` line giving the number of
    "○" marks. The file is UTF-8 encoded and overwritten if it exists.

    Returns:
        The path of the file that was written.
    """
    target = os.path.join(outdir, f"{month_key}.csv")
    circles = list(results.values()).count("○")
    with open(target, "w", encoding="utf-8") as out:
        out.writelines(f"{day},{results[day]}\n" for day in sorted(results))
        out.write(f"合計○,{circles}\n")
    return target

def main(argv):
    """Command-line entry point: analyse each matching PDF and write reports.

    For every PDF matched by the glob patterns, a ``YYYY-MM.csv`` file is
    written to the current working directory. Afterwards one
    ``summary-YYYY.txt`` per year is written with ``YYYY-MM, <count>`` rows.
    A PDF that cannot be analysed is reported on stderr and skipped.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and the
            remaining items are glob patterns, which are expanded here.

    Returns:
        The process exit status: 0 on completion, 1 when no pattern was
        given, 2 when the patterns matched no file.
    """
    if len(argv) < 2:
        # Show the name the script was actually started with instead of a
        # hard-coded file name.
        prog = os.path.basename(argv[0]) if argv and argv[0] else "pdf2ai"
        print(f"Usage: python {prog} <PDF_PATTERN> [<PDF_PATTERN> ...]", file=sys.stderr)
        return 1

    # Overlapping patterns can match the same file more than once. Keep one
    # entry per absolute path so a month is not analysed twice and does not
    # show up twice in the yearly summary.
    unique = {}
    for pat in argv[1:]:
        for hit in glob.glob(pat):
            unique.setdefault(os.path.abspath(hit), hit)
    files = sorted(unique.values())
    if not files:
        print("No files matched.", file=sys.stderr)
        return 2

    outdir = os.getcwd()
    summary_by_year = defaultdict(list)

    for path in files:
        try:
            month_key, results = analyze_month(path)
        except Exception as e:
            # Batch run: one unreadable or misnamed PDF must not stop the rest.
            print(f"[WARN] Skipping {path}: {e}", file=sys.stderr)
            continue
        csv_path = write_csv(month_key, results, outdir)
        o_count = sum(1 for v in results.values() if v == "○")
        summary_by_year[int(month_key[:4])].append((month_key, o_count))
        print(f"[OK] {path} -> {csv_path} (○={o_count})")

    for year, rows in summary_by_year.items():
        spath = os.path.join(outdir, f"summary-{year}.txt")
        with open(spath, "w", encoding="utf-8") as f:
            for mk, oc in sorted(rows, key=lambda x: x[0]):
                f.write(f"{mk}, {oc}\n")
        print(f"[OK] Wrote {spath}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))