"""Utilities for working with arXiv paper identifiers.\n
Provides functions for parsing arXiv IDs (both new-style ``YYMM.NNNNN`` and
old-style ``category/YYMMNNN`` formats), converting them to filesystem paths,
and wrapping/unwrapping HTML document fragments.
"""

import html
import os
import re

# Repository root: the parent of the directory that contains this file.
# Paper folders returned by arxiv_id_to_directory_path() are rooted here.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# New-style identifiers: ``YYMM.NNNNN``; anything after the digits (e.g. a
# ``v2`` version suffix) is ignored because the pattern is not end-anchored.
_NEW_STYLE_ID_RE = re.compile(r"^(\d{2})(\d{2})\.(\d+)")
# Old-style identifiers: ``archive/YYMMNNN`` with an optional subject class
# after the archive name (``archive.SC/YYMMNNN``, e.g. ``math.GT/0309136``).
_OLD_STYLE_ID_RE = re.compile(r"^[a-z\-]+(?:\.[A-Za-z\-]+)?/(\d{2})(\d{2})(\d+)")


def parse_arxiv_id_components(arxiv_id: str) -> tuple[int, int, int]:
    """Extract (year, month, sequence index) from an arXiv ID.

    Handles both new-style and old-style arXiv identifiers:

    - New-style: ``YYMM.NNNNN`` (e.g., ``2604.20797`` → ``(2026, 4, 20797)``)
    - Old-style: ``category/YYMMNNN`` (e.g., ``hep-lat/9810026`` →
      ``(1998, 10, 26)``), optionally with a subject class
      (e.g., ``math.GT/0309136`` → ``(2003, 9, 136)``)

    Only the start of the string is matched, so trailing text such as a
    version suffix (``2604.20797v2``) is ignored.

    Two-digit years are converted to four digits: ≥91 maps to 1900s,
    <91 maps to 2000s.

    Args:
        arxiv_id: An arXiv paper identifier in either new-style or old-style
            format.

    Returns:
        A tuple of ``(year, month, index)`` where *year* is four digits,
        *month* is 1–12, and *index* is the trailing sequence number.

    Raises:
        ValueError: If *arxiv_id* does not match either format, or if its
            month field is outside 1–12.
    """
    # The two formats are mutually exclusive (one starts with digits, the
    # other with an archive name), so at most one pattern can match.
    for pattern in (_NEW_STYLE_ID_RE, _OLD_STYLE_ID_RE):
        m = pattern.match(arxiv_id)
        if m is None:
            continue
        yy, mm, idx = (int(part) for part in m.groups())
        # Enforce the documented month range instead of returning a bogus date.
        if 1 <= mm <= 12:
            yyyy = 1900 + yy if yy >= 91 else 2000 + yy
            return (yyyy, mm, idx)
    raise ValueError(f"Unrecognised arXiv ID: {arxiv_id!r}")

def arxiv_id_sort_key(arxiv_id: str) -> tuple[int, int, int]:
    """Build a chronological sort key for an arXiv ID.

    Delegates to :func:`parse_arxiv_id_components` and substitutes
    ``(0, 0, 0)`` when that function raises ``ValueError``, so the function
    can be passed straight to :func:`sorted` / ``list.sort`` as ``key``
    without unparseable IDs aborting the sort (they sort first).

    Args:
        arxiv_id: An arXiv paper identifier.

    Returns:
        The parsed ``(year, month, index)`` tuple, or ``(0, 0, 0)`` when the
        ID is not recognised.
    """
    unparseable_key = (0, 0, 0)
    try:
        components = parse_arxiv_id_components(arxiv_id)
    except ValueError:
        return unparseable_key
    return components

def arxiv_id_to_directory_path(arxiv_id: str) -> str:
    """Return the absolute ``BASE_DIR/YYYY/<safe id>`` folder path for an arXiv ID.

    The folder lives under the repository root (``BASE_DIR``), inside a
    four-digit year directory derived from the ID.  For old-style IDs the
    slash in the ID is replaced with an underscore in the folder name
    (e.g., ``hep-lat/9810026`` → ``<BASE_DIR>/1998/hep-lat_9810026``).
    The path is only computed; nothing is created on disk.

    Args:
        arxiv_id: An arXiv paper identifier.

    Returns:
        Absolute path to the paper's folder, without a trailing separator,
        e.g. ``/repo/2026/2604.20797``.

    Raises:
        ValueError: If *arxiv_id* cannot be parsed.
    """
    try:
        year, _month, _idx = parse_arxiv_id_components(arxiv_id)
    except ValueError as err:
        # Chain explicitly so the traceback shows the parser's error as the cause.
        raise ValueError(f"Cannot parse arXiv ID: {arxiv_id}") from err
    return os.path.join(BASE_DIR, f"{year:04d}", arxiv_id_to_safe_filename(arxiv_id))

def arxiv_id_to_safe_filename(arxiv_id: str) -> str:
    """Turn an arXiv ID into a name usable as a single path component.

    Every ``/`` becomes ``_``, so an old-style ID such as ``hep-ph/0603175``
    yields ``hep-ph_0603175``; IDs without a slash are returned unchanged.

    Args:
        arxiv_id: An arXiv paper identifier.

    Returns:
        The identifier with all forward slashes replaced by underscores.
    """
    segments = arxiv_id.split("/")
    return "_".join(segments)

def wrap_in_html_document(body: str, title: str = "") -> str:
    """Wrap an HTML fragment in a complete HTML5 document.

    Adds ``<!DOCTYPE html>``, ``<html>``, ``<head>`` (with UTF-8 charset
    and optional ``<title>``), and ``<body>`` tags around the given content.

    Args:
        body: Raw HTML content to place inside ``<body>``.  It is inserted
            verbatim (no escaping).
        title: Optional plain-text document title.  If non-empty, a
            ``<title>`` tag is included in ``<head>``; ``&``, ``<`` and ``>``
            in the title are HTML-escaped so they cannot break the markup.
            If empty, the title line in ``<head>`` is left blank.

    Returns:
        A complete HTML5 document string, terminated by a newline.
    """
    # The title is text, not markup: escape it so e.g. "a < b" or a stray
    # "</title>" cannot terminate the element early or inject tags.
    title_html = f"<title>{html.escape(title, quote=False)}</title>" if title else ""
    return (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="utf-8">\n'
        f"{title_html}\n"
        "</head>\n"
        "<body>\n"
        f"{body}\n"
        "</body>\n"
        "</html>\n"
    )

def unwrap_in_html_document(html_str: str) -> str:
    """Extract the body content from a wrapped HTML document.

    If the input contains a ``<body>...</body>`` element, returns only the
    content between the opening tag and the *last* closing tag (stripped of
    leading/trailing whitespace).  The opening tag may carry attributes, the
    tags are matched case-insensitively, and no particular line layout is
    required.  If no ``<body>`` element is found, returns the entire input
    stripped of whitespace.

    Args:
        html_str: An HTML string, either a full document or a fragment.

    Returns:
        The body content, with surrounding whitespace removed.
    """
    # Greedy ``.*`` runs to the final ``</body>`` so content that itself
    # contains the text "</body>" still round-trips through wrap/unwrap.
    m = re.search(
        r"<body(?:\s[^>]*)?>(.*)</body\s*>",
        html_str,
        re.DOTALL | re.IGNORECASE,
    )
    if m:
        return m.group(1).strip()
    return html_str.strip()
