"""Test learned regex patterns against example PDF corpus."""

from pathlib import Path

import pdfplumber

from app.parsers.registry import detect_company
from app.parsers.text_normalize import normalize_pdf_text
from app.utils.pattern_suggestion import extract_with_regex

# Sample corpus location: the "example-pdf" directory under parents[2] of this
# file's resolved path (i.e. two levels above the directory containing it).
EXAMPLE_PDF_DIR = Path(__file__).resolve().parents[2] / "example-pdf"


def run_pattern_on_samples(
    company_code: str,
    field_name: str,
    regex: str,
    *,
    max_files: int = 50,
) -> dict:
    """Apply ``regex`` to every sample PDF attributed to ``company_code``.

    The first ``max_files`` ``*.pdf`` entries of ``EXAMPLE_PDF_DIR`` (in sorted
    order) are scanned; note the cap is applied *before* company detection, so
    it bounds the number of files read, not the number of files tested.

    Args:
        company_code: Only PDFs whose detected company equals this are tested.
        field_name: Echoed back in the report; not used for matching here.
        regex: Pattern handed to ``extract_with_regex``.
        max_files: Upper bound on how many PDFs are read from the corpus.

    Returns:
        A report dict with the echoed inputs, ``total_files`` (PDFs tested),
        ``matched_files``, ``failed_count``, ``failed_files`` (file names),
        ``passed`` and the per-file ``results`` rows. ``passed`` is true when
        no tested file failed, which includes the case of no files tested.
        When the corpus directory is missing, an all-zero passing report with
        an extra ``error`` key is returned instead.
    """
    header = {
        "company_code": company_code,
        "field_name": field_name,
        "regex": regex,
    }

    if not EXAMPLE_PDF_DIR.exists():
        # No corpus available: report vacuous success but flag the reason.
        return {
            **header,
            "total_files": 0,
            "matched_files": 0,
            "results": [],
            "passed": True,
            "failed_count": 0,
            "failed_files": [],
            "error": "example-pdf directory not found",
        }

    rows: list[dict] = []
    candidates = sorted(EXAMPLE_PDF_DIR.glob("*.pdf"))
    for sample in candidates[:max_files]:
        sample_text = _extract_pdf_text(sample)
        owner_code, _, _ = detect_company(sample_text, sample.name)
        if owner_code == company_code:
            value, ok = extract_with_regex(regex, sample_text)
            rows.append(
                {
                    "file": sample.name,
                    # A hit requires both the ok flag and a non-empty value.
                    "matched": bool(ok and value),
                    "value": value,
                }
            )

    missed_names = [row["file"] for row in rows if not row["matched"]]
    miss_count = len(missed_names)

    return {
        **header,
        "total_files": len(rows),
        "matched_files": len(rows) - miss_count,
        "failed_count": miss_count,
        "failed_files": missed_names,
        # Zero tested files implies zero misses, so this also covers that case.
        "passed": not missed_names,
        "results": rows,
    }


def evaluate_regression(company_code: str, field_name: str, regex: str) -> dict:
    """Run the sample-corpus check used as the gate before saving a pattern.

    Delegates to ``run_pattern_on_samples`` with its default file cap and
    returns that report unchanged; the ``passed`` entry is the gate result.
    """
    report = run_pattern_on_samples(
        company_code=company_code,
        field_name=field_name,
        regex=regex,
    )
    return report


def test_pattern_on_text(regex: str, text: str) -> dict:
    """Try ``regex`` against a single text and report the outcome.

    A falsy ``text`` (e.g. ``None`` or ``""``) is treated as the empty string.

    Returns:
        ``{"matched": ok, "value": value}`` using the pair returned by
        ``extract_with_regex``. Unlike ``run_pattern_on_samples``, ``matched``
        here is the raw ``ok`` flag and does not additionally require a
        non-empty value.
    """
    haystack = text if text else ""
    value, ok = extract_with_regex(regex, haystack)
    return dict(matched=ok, value=value)


def _extract_pdf_text(pdf_path: Path) -> str:
    """Return normalized text from the leading pages of ``pdf_path``.

    At most the first 10 pages are read. Pages for which ``extract_text()``
    yields nothing contribute an empty string. Page texts are joined with
    newlines and the result is passed through ``normalize_pdf_text``.
    """
    with pdfplumber.open(pdf_path) as doc:
        # Join while the document is still open, since pages are read from it.
        raw_text = "\n".join(
            page.extract_text() or "" for page in doc.pages[:10]
        )
    return normalize_pdf_text(raw_text)