"""Detect scanned PDFs and extract text via OCR when native text is sparse."""

from __future__ import annotations

import io
import logging
import os
import shutil
from functools import lru_cache
from typing import Any

import pdfplumber

from app.core.config import get_settings
from app.parsers.text_normalize import normalize_pdf_text

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default cap on the number of leading pages processed per document by the
# OCR helpers and by native_char_count().
MAX_PAGES = 30
# Per-page text longer than this many characters is truncated by the OCR helpers.
MAX_PAGE_TEXT_CHARS = 8000
# Total stripped character count below which is_likely_scanned() reports a
# scanned document, and at or above which ocr_text_pages() accepts an
# engine's output outright.
MIN_TOTAL_NATIVE_CHARS = 120
# Stripped character count below which a single page counts as sparse in
# is_likely_scanned() and is sent through OCR in ocr_text_pages_fitz().
MIN_PAGE_NATIVE_CHARS = 25

# Process-wide OCR readiness written by ocr_status(); None until first probed.
_OCR_STATUS: str | None = None  # ready | disabled | unavailable


def _common_tesseract_paths() -> list[str]:
    """List candidate tessdata directories in lookup order.

    The first entry is the stripped ``TESSDATA_PREFIX`` environment value
    (an empty string when the variable is unset), followed by hard-coded
    Windows and Unix-style install locations. Nothing is checked against the
    filesystem here; callers are expected to filter the result.
    """
    env_prefix = os.environ.get("TESSDATA_PREFIX", "").strip()
    windows_dirs = [
        r"C:\Program Files\Tesseract-OCR\tessdata",
        r"C:\Program Files (x86)\Tesseract-OCR\tessdata",
    ]
    unix_dirs = [
        "/usr/share/tesseract-ocr/5/tessdata",
        "/usr/share/tesseract-ocr/4.00/tessdata",
        "/usr/local/share/tessdata",
    ]
    return [env_prefix, *windows_dirs, *unix_dirs]


def configure_tesseract_env() -> None:
    """Export Tesseract locations from settings into the process environment.

    ``TESSDATA_PREFIX`` is taken from settings when configured. Otherwise, if
    the variable is not already set, the first existing directory returned by
    ``_common_tesseract_paths()`` is used. When settings name a Tesseract
    executable it is exported as ``TESSERACT_CMD`` and, if ``pytesseract`` is
    importable, assigned to ``pytesseract.pytesseract.tesseract_cmd`` as well.
    """
    cfg = get_settings()

    configured_prefix = cfg.tessdata_prefix
    if configured_prefix:
        os.environ["TESSDATA_PREFIX"] = configured_prefix
    elif not os.environ.get("TESSDATA_PREFIX"):
        # Fall back to the first well-known install directory that exists.
        detected = next(
            (path for path in _common_tesseract_paths() if path and os.path.isdir(path)),
            None,
        )
        if detected is not None:
            os.environ["TESSDATA_PREFIX"] = detected

    executable = cfg.tesseract_cmd
    if not executable:
        return

    os.environ["TESSERACT_CMD"] = executable
    try:
        import pytesseract
    except ImportError:
        # pytesseract is optional; the environment variable is still exported.
        return
    pytesseract.pytesseract.tesseract_cmd = executable


@lru_cache(maxsize=1)
def ocr_status() -> str:
    """
    Report OCR readiness as one of ``ready``, ``disabled`` or ``unavailable``.

    The answer is remembered both by ``lru_cache`` and in the module-level
    ``_OCR_STATUS`` variable, so the probe (and its log line) normally runs
    once per process.

    OCR counts as ready when ``TESSDATA_PREFIX`` points at an existing
    directory, or failing that when a Tesseract executable is found via
    settings, ``TESSERACT_CMD`` or ``PATH``.
    """
    global _OCR_STATUS
    if _OCR_STATUS is not None:
        return _OCR_STATUS

    if not get_settings().ocr_enabled:
        _OCR_STATUS = "disabled"
        logger.info("PDF OCR disabled via OCR_ENABLED=false")
        return _OCR_STATUS

    configure_tesseract_env()

    data_dir = os.environ.get("TESSDATA_PREFIX", "").strip()
    has_data_dir = bool(data_dir) and os.path.isdir(data_dir)
    # The executable lookup only happens when no usable tessdata directory exists.
    executable = None
    if not has_data_dir:
        executable = (
            get_settings().tesseract_cmd
            or os.environ.get("TESSERACT_CMD")
            or shutil.which("tesseract")
        )

    if has_data_dir:
        _OCR_STATUS = "ready"
        logger.info("PDF OCR ready (TESSDATA_PREFIX=%s)", data_dir)
    elif executable:
        _OCR_STATUS = "ready"
        logger.info("PDF OCR ready (tesseract=%s)", executable)
    else:
        _OCR_STATUS = "unavailable"
        logger.info(
            "PDF OCR unavailable: Tesseract not installed. "
            "Text-based PDFs still work; scanned/image PDFs need OCR. "
            "Run api/install-ocr.ps1 or set TESSDATA_PREFIX in api/.env"
        )
    return _OCR_STATUS


def is_likely_scanned(pages: list[dict[str, Any]]) -> bool:
    """Decide whether extracted pages look like a scanned (image-only) PDF.

    Args:
        pages: Page dicts whose optional ``"text"`` entry holds native text.

    Returns:
        ``True`` when there are no pages, when the stripped text of all pages
        totals fewer than ``MIN_TOTAL_NATIVE_CHARS`` characters, or when the
        number of pages with fewer than ``MIN_PAGE_NATIVE_CHARS`` characters
        reaches ``max(1, len(pages) // 2)``; ``False`` otherwise.
    """
    if not pages:
        return True

    # Measure each page once; a missing or None "text" counts as zero characters.
    char_counts = [len((entry.get("text") or "").strip()) for entry in pages]
    if sum(char_counts) < MIN_TOTAL_NATIVE_CHARS:
        return True

    sparse_threshold = max(1, len(pages) // 2)
    sparse_count = len([count for count in char_counts if count < MIN_PAGE_NATIVE_CHARS])
    return sparse_count >= sparse_threshold


def ocr_text_pages_fitz(file_bytes: bytes, *, max_pages: int = MAX_PAGES) -> list[dict[str, Any]] | None:
    """Extract per-page text with PyMuPDF, running OCR on pages with sparse native text.

    Each of the first ``max_pages`` pages is read natively. A page whose
    stripped text is shorter than ``MIN_PAGE_NATIVE_CHARS`` is re-read through
    PyMuPDF's OCR text page; if that OCR attempt fails, the native text is
    kept. Page text is normalized and truncated to ``MAX_PAGE_TEXT_CHARS``.

    Args:
        file_bytes: Raw PDF content.
        max_pages: Maximum number of leading pages to process.

    Returns:
        A list of ``{"page": <1-based number>, "text": <text>}`` dicts, or
        ``None`` when OCR is not ready, PyMuPDF is not installed, the document
        cannot be processed, or no pages were produced.
    """
    if ocr_status() != "ready":
        return None

    try:
        import fitz
    except ImportError:
        return None

    tessdata = os.environ.get("TESSDATA_PREFIX") or None
    pages: list[dict[str, Any]] = []
    try:
        # The context manager closes the document on every exit path, so the
        # handle is released even when a page raises or the loop stops early.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            for index, page in enumerate(doc):
                if index >= max_pages:
                    break
                text = (page.get_text() or "").strip()
                if len(text) < MIN_PAGE_NATIVE_CHARS:
                    try:
                        textpage = page.get_textpage_ocr(dpi=200, full=True, tessdata=tessdata)
                        text = (page.get_text(textpage=textpage) or "").strip()
                    except Exception as exc:
                        # Best effort: keep the native text when OCR fails for this page.
                        logger.debug("PyMuPDF OCR skipped on page %s: %s", index + 1, exc)
                page_text = normalize_pdf_text(text)
                if len(page_text) > MAX_PAGE_TEXT_CHARS:
                    page_text = page_text[:MAX_PAGE_TEXT_CHARS]
                pages.append({"page": index + 1, "text": page_text})
    except Exception as exc:
        logger.warning("PyMuPDF OCR failed: %s", exc)
        return None
    return pages if pages else None


def ocr_text_pages_tesseract(file_bytes: bytes, *, max_pages: int = MAX_PAGES) -> list[dict[str, Any]] | None:
    """Rasterize PDF pages with pdf2image and recognize their text with pytesseract.

    Pages 1 through ``max_pages`` are rendered at 200 dpi as JPEG, passed
    through ``preprocess_image_for_ocr``, recognized, normalized and truncated
    to ``MAX_PAGE_TEXT_CHARS``.

    Args:
        file_bytes: Raw PDF content.
        max_pages: Last page (1-based) to render.

    Returns:
        A list of ``{"page": <1-based number>, "text": <text>}`` dicts, or
        ``None`` when OCR is not ready, ``pdf2image``/``pytesseract`` are not
        installed, rendering or recognition raises, or no pages were produced.
    """
    if ocr_status() != "ready":
        return None

    try:
        from pdf2image import convert_from_bytes
        import pytesseract
    except ImportError:
        return None

    from app.utils.pdf_image_preprocess import preprocess_image_for_ocr

    configure_tesseract_env()
    results: list[dict[str, Any]] = []
    try:
        rendered_pages = convert_from_bytes(
            file_bytes,
            dpi=200,
            first_page=1,
            last_page=max_pages,
            fmt="jpeg",
        )
        for page_number, rendered in enumerate(rendered_pages, start=1):
            recognized = pytesseract.image_to_string(preprocess_image_for_ocr(rendered))
            cleaned = normalize_pdf_text(recognized or "")
            # Slicing leaves text at or under the cap untouched.
            results.append({"page": page_number, "text": cleaned[:MAX_PAGE_TEXT_CHARS]})
    except Exception as exc:
        logger.warning("Tesseract OCR failed: %s", exc)
        return None
    return results or None


def ocr_text_pages(file_bytes: bytes, *, max_pages: int = MAX_PAGES) -> tuple[list[dict[str, Any]], str | None]:
    """Run the OCR engines in order and return pages plus the engine used.

    PyMuPDF is tried first, and the Tesseract pipeline only runs when the
    PyMuPDF result holds fewer than ``MIN_TOTAL_NATIVE_CHARS`` characters in
    total. If neither result reaches that threshold, any non-empty result is
    returned, preferring PyMuPDF.

    Returns:
        ``(pages, engine)`` where ``engine`` is ``"fitz"`` or ``"tesseract"``,
        or ``([], None)`` when OCR is not ready or both engines yield nothing.
    """
    if ocr_status() != "ready":
        return [], None

    def has_enough_text(candidate: list[dict[str, Any]] | None) -> bool:
        # An absent or empty result never qualifies, regardless of the threshold.
        if not candidate:
            return False
        return sum(len(entry["text"]) for entry in candidate) >= MIN_TOTAL_NATIVE_CHARS

    from_fitz = ocr_text_pages_fitz(file_bytes, max_pages=max_pages)
    if has_enough_text(from_fitz):
        return from_fitz, "fitz"

    from_tesseract = ocr_text_pages_tesseract(file_bytes, max_pages=max_pages)
    if has_enough_text(from_tesseract):
        return from_tesseract, "tesseract"

    # Neither engine cleared the threshold: fall back to whatever exists.
    for candidate, engine in ((from_fitz, "fitz"), (from_tesseract, "tesseract")):
        if candidate:
            return candidate, engine
    return [], None


def ocr_engine_label(engine: str | None, *, used_opencv: bool = True) -> str | None:
    """Build the display label for an OCR engine name.

    Args:
        engine: Engine name, or a falsy value when no OCR engine was used.
        used_opencv: Whether to tag the Tesseract engine with ``+opencv``.

    Returns:
        ``None`` for a falsy ``engine``, ``"tesseract+opencv"`` for the
        Tesseract engine when ``used_opencv`` is true, and ``engine``
        unchanged otherwise.
    """
    if engine == "tesseract" and used_opencv:
        return "tesseract+opencv"
    # Any falsy engine (None or an empty string) collapses to None.
    return engine or None


def native_char_count(file_bytes: bytes, *, max_pages: int = MAX_PAGES) -> int:
    """Count natively extractable characters in the leading pages of a PDF.

    Args:
        file_bytes: Raw PDF content, opened with pdfplumber from memory.
        max_pages: Number of leading pages to inspect.

    Returns:
        The summed length of each inspected page's stripped extracted text;
        a page that yields no text contributes zero.
    """
    with pdfplumber.open(io.BytesIO(file_bytes)) as document:
        leading_pages = document.pages[:max_pages]
        return sum(len((pg.extract_text() or "").strip()) for pg in leading_pages)
