#!/usr/bin/env python3
# -*- coding: utf-8 -*-
r"""
pdf2ai (rev4)

- Q/A detection is strict against false positives (e.g., "Q&A" headlines) but
  permits colon-less single-line speaker labels:
    Q side: line-start + (colon OR single-line label)
    A side: line-start + (colon OR single-line label)
- Examples counted as dialogue:
    User\n\nChatGPT\n   (colon-less, single-line labels)
    Q: ...\nA: ...      (colon labels)
- CSV adds "合計○,<count>" at the end.
- Per-year summary written as CSV-like "YYYY-MM, <count>" to summary-YYYY.txt.

Usage:
  python pdf2ai 2025-10.pdf
  python pdf2ai "/path/to/2024-*.pdf"
"""
import calendar
import glob
import os
import re
import sys
from collections import defaultdict
from typing import Dict, List, Tuple

try:
    from PyPDF2 import PdfReader
except Exception:
    # Fail fast with an actionable hint, then let the original error propagate.
    print(
        "ERROR: PyPDF2 is required to run this script. "
        "Please install with 'pip install PyPDF2'.",
        file=sys.stderr,
    )
    raise


def _unsplit_digits(s: str) -> str:
    """Return *s* with every whitespace run removed.

    DATE_RE tolerates whitespace between the digits of a number (e.g.
    "2 0 2 5"); this joins such digits back together before ``int()``.
    """
    return re.sub(r'\s+', '', s)


# A Japanese-style date "YYYY年M月D日". Whitespace is allowed between any two
# digits and around the 年/月/日 markers. Named groups Y, M and D are read by
# split_by_dates().
# NOTE(review): the outer group's name was lost from the source; nothing
# visible refers to it, so "date" is a placeholder name.
DATE_RE = re.compile(
    r'(?P<date>'
    r'(?P<Y>\d(?:\s*\d){3})\s*年\s*'
    r'(?P<M>\d(?:\s*\d)?)\s*[月\u2F49]\s*'   # U+2F49: Kangxi-radical look-alike of 月
    r'(?P<D>\d(?:\s*\d)?)\s*[日\u2F47]'      # U+2F47: Kangxi-radical look-alike of 日
    r')',
    flags=re.UNICODE
)

# Building blocks shared by the two speaker-label patterns below. Non-ASCII
# characters are written as \u escapes so they survive re-encoding.
# NOTE(review): the third whitespace character of the original class was lost;
# it is presumed to be the ideographic space (U+3000). The no-break space
# (U+00A0) is accepted as well.
_LABEL_WS = r'[ \t\u00A0\u3000]*'
_LABEL_COLON = r'[:\uFF1A]'                      # half-width or full-width colon
_LABEL_PAREN = r'[(\uFF08][^)\uFF09\n]*[)\uFF09]'  # "(...)" or full-width "（...）" on one line
_Q_NAMES = r'(?:User|Users|ユーザー)'
_A_NAMES = (
    r'(?:ChatGPT|Gemini|Claude|Perplexity|Grok|DeepSeek|Qwen'
    r'|Assistant|アシスタント)'
)

# Q/A speaker labels must be at line start (multiline). Full/half colon accepted.
# Additionally, allow *colon-less* single-line labels (just the speaker name on
# its own line).
Q_MARK = re.compile(
    r'(?im)^' + _LABEL_WS + r'(?:'
    r'[Q\uFF31]' + _LABEL_WS + _LABEL_COLON                     # Q: (half/full width)
    + r'|' + _Q_NAMES + _LABEL_WS + _LABEL_COLON                # User: / Users: / ユーザー:
    + r'|[Q\uFF31]' + _LABEL_PAREN + _LABEL_WS + _LABEL_COLON   # Q(...):
    + r'|' + _Q_NAMES + _LABEL_WS + r'$'                        # colon-less single-line label
    + r')'
)

A_MARK = re.compile(
    r'(?im)^' + _LABEL_WS + r'(?:'
    r'[A\uFF21]' + _LABEL_WS + _LABEL_COLON                     # A: (half/full width)
    + r'|' + _A_NAMES + _LABEL_WS + _LABEL_COLON                # ChatGPT: / Assistant: / ...
    + r'|[A\uFF21]' + _LABEL_PAREN + _LABEL_WS + _LABEL_COLON   # A(...):
    + r'|' + _A_NAMES + _LABEL_WS + r'$'                        # colon-less single-line label
    + r')'
)

# Guard against "Q&A" headings that are not dialogues.
# Matches "Q&A", "Q & A" and "QA" (case-insensitive) anywhere in a text.
QANDA = re.compile(r'(?i)Q\s*&?\s*A')


def extract_text_from_pdf(path: str) -> str:
    """Return the text of every page of the PDF at *path*, joined by newlines.

    Extraction is best-effort: a page whose ``extract_text()`` raises
    contributes an empty string, and a warning is written to stderr so the
    gap is visible instead of silent.
    """
    reader = PdfReader(path)
    chunks = []
    for page_no, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text() or ""
        except Exception as exc:
            print(
                f"[WARN] {path}: no text extracted from page {page_no}: {exc}",
                file=sys.stderr,
            )
            text = ""
        chunks.append(text)
    return "\n".join(chunks)


def split_by_dates(month_text: str, y: int, m: int) -> List[Tuple[int, Tuple[int, int]]]:
    """Split *month_text* into per-day spans using the dates found by DATE_RE.

    Only dates in year *y*, month *m* start a span. Dates of other months are
    ignored until day 1 of the target month has been seen; the first foreign
    date after that ends the scan.

    Returns:
        A list of ``(day, (start, end))`` in order of appearance. Each span
        runs from the start of its date to the start of the next in-month
        date. The last span ends at the foreign date that stopped the scan,
        or at the end of the text if the scan was never stopped.
    """
    starts = []  # (day, offset of the date match)
    cutoff = len(month_text)
    reached_first_day = False
    for mobj in DATE_RE.finditer(month_text):
        year = int(_unsplit_digits(mobj.group('Y')))
        month = int(_unsplit_digits(mobj.group('M')))
        day = int(_unsplit_digits(mobj.group('D')))
        if year == y and month == m:
            starts.append((day, mobj.start()))
            if day == 1:
                reached_first_day = True
        elif reached_first_day:
            # Text from here on belongs to another month; do not let the
            # last in-month span swallow it.
            cutoff = mobj.start()
            break

    ranges = []
    for i, (day, start) in enumerate(starts):
        end = starts[i + 1][1] if i + 1 < len(starts) else cutoff
        ranges.append((day, (start, end)))
    return ranges


def has_qa_dialogue(text_block: str) -> bool:
    """Return True if *text_block* has a Q-side label with an A-side label at or after it.

    A bare "Q&A" headline is not counted: Q_MARK and A_MARK are the only
    patterns consulted, and a block lacking either label is rejected whether
    or not QANDA matches it, so no separate QANDA check is needed.
    """
    q = Q_MARK.search(text_block)
    if q is None:
        return False
    return A_MARK.search(text_block, q.start()) is not None


def analyze_month(pdf_path: str) -> Tuple[str, Dict[int, str]]:
    """Classify every day of the month covered by *pdf_path*.

    The year and month are taken from the first ``YYYY-MM`` in the file name.

    Returns:
        ``(month_key, results)`` where ``month_key`` is ``"YYYY-MM"`` and
        ``results`` maps each day of that month to "○" (some text segment for
        that day contains a Q/A dialogue) or "△" (otherwise).

    Raises:
        ValueError: if the file name has no ``YYYY-MM`` or the month is not
            in 1..12. Both are checked before the PDF is opened.
    """
    base = os.path.basename(pdf_path)
    mm = re.search(r'(\d{4})-(\d{2})', base)
    if not mm:
        raise ValueError(f"Cannot infer YYYY-MM from filename: {base}")
    y = int(mm.group(1))
    m = int(mm.group(2))
    if not 1 <= m <= 12:
        raise ValueError(f"Invalid month in filename: {base}")
    month_key = f"{y:04d}-{m:02d}"

    text = extract_text_from_pdf(pdf_path)
    # The same date can occur more than once in the text; keep every segment
    # instead of letting a later one overwrite an earlier one.
    segments = defaultdict(list)
    for d, (s, e) in split_by_dates(text, y, m):
        segments[d].append(text[s:e])

    _, last_day = calendar.monthrange(y, m)
    results = {}
    for d in range(1, last_day + 1):
        found = any(has_qa_dialogue(seg) for seg in segments.get(d, ()))
        results[d] = "○" if found else "△"
    return month_key, results


def write_csv(month_key: str, results: Dict[int, str], outdir: str) -> str:
    """Write ``<outdir>/<month_key>.csv`` and return its path.

    One ``day,mark`` row is written per day in ascending order, followed by a
    final ``合計○,<count>`` row holding the number of "○" days.
    """
    out_path = os.path.join(outdir, f"{month_key}.csv")
    o_count = sum(1 for v in results.values() if v == "○")
    with open(out_path, "w", encoding="utf-8") as f:
        for d in sorted(results):
            f.write(f"{d},{results[d]}\n")
        f.write(f"合計○,{o_count}\n")
    return out_path


def main(argv) -> int:
    """Command-line entry point.

    Every argument after ``argv[0]`` is a glob pattern. Each matched PDF gets
    a ``YYYY-MM.csv`` in the current directory, and each year seen gets a
    ``summary-YYYY.txt`` with one ``YYYY-MM, <count>`` line per month. A file
    that cannot be analysed is reported on stderr and skipped.

    Returns:
        0 on completion, 1 if no pattern was given, 2 if nothing matched.
    """
    if len(argv) < 2:
        prog = os.path.basename(argv[0]) if argv else "pdf2ai"
        print(
            f"Usage: python {prog} <pdf-or-glob> [<pdf-or-glob> ...]",
            file=sys.stderr,
        )
        return 1

    files = []
    for pat in argv[1:]:
        files.extend(glob.glob(pat))
    if not files:
        print("No files matched.", file=sys.stderr)
        return 2

    outdir = os.getcwd()
    summary_by_year = defaultdict(list)
    # set(): overlapping patterns must not process (and count) a file twice.
    for path in sorted(set(files)):
        try:
            month_key, results = analyze_month(path)
        except Exception as e:
            print(f"[WARN] Skipping {path}: {e}", file=sys.stderr)
            continue
        csv_path = write_csv(month_key, results, outdir)
        o_count = sum(1 for v in results.values() if v == "○")
        summary_by_year[int(month_key[:4])].append((month_key, o_count))
        print(f"[OK] {path} -> {csv_path} (○={o_count})")

    for year, rows in summary_by_year.items():
        spath = os.path.join(outdir, f"summary-{year}.txt")
        with open(spath, "w", encoding="utf-8") as f:
            for mk, oc in sorted(rows, key=lambda row: row[0]):
                f.write(f"{mk}, {oc}\n")
        print(f"[OK] Wrote {spath}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))