"""Map PDF bounding boxes to extracted text (pdfplumber coordinates)."""

from __future__ import annotations

import io
import math
import re
from typing import Any

import pdfplumber

from app.parsers.text_normalize import normalize_pdf_text
from app.utils.pdf_text import MAX_PAGES, PAGE_TEXT_SEPARATOR

# Four-float bounding box in (x0, top, x1, bottom) order, matching the dict
# keys ("x0", "top", "x1", "bottom") accepted by normalize_bbox.
Bbox = tuple[float, float, float, float]


def get_pdf_page_meta(file_bytes: bytes, *, max_pages: int = MAX_PAGES) -> list[dict[str, Any]]:
    """Return page number and dimensions for the leading pages of a PDF.

    Args:
        file_bytes: Raw PDF content; it is wrapped in an in-memory stream
            and opened with pdfplumber.
        max_pages: Upper bound on how many pages (from the start of the
            document) are reported.

    Returns:
        One dict per page, in document order, with keys ``"page"``
        (1-based page number), ``"width"`` and ``"height"`` (both floats,
        taken from pdfplumber's page object).
    """
    stream = io.BytesIO(file_bytes)
    with pdfplumber.open(stream) as document:
        return [
            {
                "page": number,
                "width": float(pdf_page.width),
                "height": float(pdf_page.height),
            }
            for number, pdf_page in enumerate(document.pages[:max_pages], start=1)
        ]


def normalize_bbox(bbox: dict[str, float] | list[float] | tuple[float, ...]) -> Bbox:
    """Validate a bounding box and return it as an ordered 4-tuple of floats.

    Args:
        bbox: Either a mapping with keys ``"x0"``, ``"top"``, ``"x1"`` and
            ``"bottom"``, or a 4-item sequence in that same order. The two
            corners may be given in any order; they are sorted per axis.

    Returns:
        ``(left, top, right, bottom)`` with ``left <= right`` and
        ``top <= bottom``.

    Raises:
        ValueError: If a sequence does not have exactly four items, a
            coordinate is NaN or infinite, or the box is less than 1 unit
            wide or tall.
        KeyError: If a mapping is missing one of the four keys.
    """
    if isinstance(bbox, dict):
        x0, top, x1, bottom = bbox["x0"], bbox["top"], bbox["x1"], bbox["bottom"]
    else:
        x0, top, x1, bottom = bbox

    coords = (float(x0), float(top), float(x1), float(bottom))
    # NaN compares False against everything, so it would slip past the size
    # check below; infinities produce an infinite (hence "large enough") box.
    # Reject both explicitly.
    if not all(math.isfinite(value) for value in coords):
        raise ValueError("Bounding box coordinates must be finite numbers")

    left, right = sorted((coords[0], coords[2]))
    top_edge, bottom_edge = sorted((coords[1], coords[3]))
    if right - left < 1 or bottom_edge - top_edge < 1:
        raise ValueError("Bounding box is too small")
    return left, top_edge, right, bottom_edge


def extract_text_from_bbox(file_bytes: bytes, page_num: int, bbox: dict[str, float] | Bbox) -> str:
    """Extract the text inside a rectangular region of one PDF page.

    Args:
        file_bytes: Raw PDF content.
        page_num: 1-based page number.
        bbox: Region to read, in any form accepted by ``normalize_bbox``.

    Returns:
        The text pdfplumber extracts from the cropped page, passed through
        ``normalize_pdf_text`` and then with every whitespace run collapsed
        to a single space and the ends stripped. May be empty.

    Raises:
        ValueError: If ``page_num`` is below 1 or beyond the last page, or
            if ``normalize_bbox`` rejects the box.
    """
    if page_num < 1:
        raise ValueError("Page number must be >= 1")

    # Validate the box before paying the cost of opening the document.
    region = normalize_bbox(bbox)

    with pdfplumber.open(io.BytesIO(file_bytes)) as document:
        all_pages = document.pages
        if page_num > len(all_pages):
            raise ValueError(f"Page {page_num} not found in PDF")
        clipped = all_pages[page_num - 1].crop(region)
        extracted = clipped.extract_text() or ""
        normalized = normalize_pdf_text(extracted)

    single_spaced = re.sub(r"\s+", " ", normalized)
    return single_spaced.strip()


def page_global_offset(raw_text_pages: list[dict[str, Any]], page_num: int) -> int:
    """Return the character offset at which a page's text begins.

    The offset is the summed length of every entry that precedes the first
    entry whose ``"page"`` equals ``page_num``, counting
    ``len(PAGE_TEXT_SEPARATOR)`` once after each of those entries. A missing
    or falsy ``"text"`` counts as an empty string.

    Args:
        raw_text_pages: Page entries with ``"page"`` and ``"text"`` keys.
        page_num: Page number to look up.

    Returns:
        The accumulated offset. If no entry matches ``page_num`` this is the
        total over all entries.
    """
    running_total = 0
    for entry in raw_text_pages:
        if entry.get("page") == page_num:
            break
        page_chars = len(entry.get("text") or "")
        running_total += page_chars + len(PAGE_TEXT_SEPARATOR)
    return running_total


def _fuzzy_span(haystack: str, needle: str) -> tuple[int, int] | None:
    """Locate *needle* in *haystack*, letting whitespace runs differ in length.

    Each whitespace run inside the needle matches any non-empty whitespace
    run in the haystack; all other characters must match literally.

    Returns:
        ``(start, end)`` indices into the ORIGINAL (uncollapsed) haystack, or
        ``None`` if there is no match or the needle is empty/whitespace-only.
    """
    collapsed_needle = re.sub(r"\s+", " ", needle).strip()
    if not collapsed_needle:
        return None
    # After collapsing, tokens are separated by exactly one space each.
    pattern = r"\s+".join(re.escape(token) for token in collapsed_needle.split(" "))
    match = re.search(pattern, haystack)
    return match.span() if match else None


def _fuzzy_find(haystack: str, needle: str) -> int:
    """Return the start index of a whitespace-tolerant match, or -1.

    The index refers to the original haystack, not to a whitespace-collapsed
    copy, so it can be used directly as an offset into that string.
    """
    span = _fuzzy_span(haystack, needle)
    return -1 if span is None else span[0]


def find_value_offsets_in_raw_text(
    raw_text: str,
    raw_text_pages: list[dict[str, Any]],
    page_num: int,
    value: str,
) -> tuple[int, int]:
    """Find where an extracted value sits inside the document's raw text.

    Lookup strategies are tried in order, stopping at the first hit:

    1. exact substring match in the page's own text;
    2. whitespace-tolerant match in the page's own text;
    3. exact substring match in ``raw_text`` starting up to 20 characters
       before the page's offset;
    4. whitespace-tolerant match in the slice of ``raw_text`` covering the
       page plus 40 trailing characters.

    Args:
        raw_text: Full document text. Offsets from strategies 1 and 2 are
            the page-local index plus ``page_global_offset``; this assumes
            ``raw_text`` is the page texts joined by ``PAGE_TEXT_SEPARATOR``
            (not verifiable from this module -- confirm against the caller).
        raw_text_pages: Page entries with ``"page"`` and ``"text"`` keys.
        page_num: Page on which the value was selected.
        value: Text to locate; surrounding whitespace is ignored.

    Returns:
        ``(start, end)`` offsets. For whitespace-tolerant matches the span
        covers the text actually matched, so ``end - start`` may exceed
        ``len(value)`` when the raw text contains longer whitespace runs.

    Raises:
        ValueError: If ``value`` is empty, the page has no entry in
            ``raw_text_pages``, or the text cannot be located.
    """
    cleaned = (value or "").strip()
    if not cleaned:
        raise ValueError("Extracted text is empty")

    entry = next((p for p in raw_text_pages if p.get("page") == page_num), None)
    if entry is None:
        raise ValueError(f"Page {page_num} not found in raw text pages")

    page_text = entry.get("text") or ""
    base = page_global_offset(raw_text_pages, page_num)

    local = page_text.find(cleaned)
    if local != -1:
        start = base + local
        return start, start + len(cleaned)

    # Use the real matched span: both its start and its length can differ
    # from the collapsed value when the page text has multi-character
    # whitespace runs (line breaks plus indentation, double spaces, ...).
    span = _fuzzy_span(page_text, cleaned)
    if span is not None:
        return base + span[0], base + span[1]

    near = raw_text.find(cleaned, max(0, base - 20))
    if near != -1:
        return near, near + len(cleaned)

    span = _fuzzy_span(raw_text[base : base + len(page_text) + 40], cleaned)
    if span is not None:
        return base + span[0], base + span[1]

    raise ValueError(f"Could not locate extracted text on page {page_num}")


def resolve_bbox_selection(
    file_bytes: bytes,
    raw_text: str,
    raw_text_pages: list[dict[str, Any]],
    *,
    page: int,
    bbox: dict[str, float],
) -> dict[str, Any]:
    """Turn a box drawn on a PDF page into its text and raw-text offsets.

    Args:
        file_bytes: Raw PDF content.
        raw_text: Full document text in which the selection is located.
        raw_text_pages: Page entries with ``"page"`` and ``"text"`` keys.
        page: 1-based page number the box was drawn on.
        bbox: Box with ``"x0"``, ``"top"``, ``"x1"`` and ``"bottom"`` keys.

    Returns:
        A dict with ``"page"``, ``"value"`` (the extracted text), ``"bbox"``
        (the normalized box as a dict with the same four keys), and
        ``"selection_start"`` / ``"selection_end"`` offsets.

    Raises:
        ValueError: If the area yields no text, or if any of the helper
            functions rejects the page, the box, or cannot locate the text.
    """
    extracted = extract_text_from_bbox(file_bytes, page, bbox)
    if not extracted:
        raise ValueError("No text found in the selected area")

    offsets = find_value_offsets_in_raw_text(raw_text, raw_text_pages, page, extracted)
    begin, finish = offsets

    # Report the box in normalized form (corners ordered per axis).
    normalized = normalize_bbox(bbox)
    bbox_payload = dict(zip(("x0", "top", "x1", "bottom"), normalized))

    result: dict[str, Any] = {"page": page, "value": extracted}
    result["bbox"] = bbox_payload
    result["selection_start"] = begin
    result["selection_end"] = finish
    return result
