{escape_html_entities(meta['title'])}

#!/usr/bin/env python3
"""Shared helpers for INSPIRE-HEP API interactions.\n
Provides functions for querying the INSPIRE-HEP REST API, fetching BibTeX
entries, extracting metadata from JSON records, and building markdown/HTML
abstract pages.  Handles rate limiting (15 req/5 s) with automatic retry
and exponential backoff on HTTP 429.
"""

import math
import sys
import time

import requests

from bibtex_lib import escape_html_entities, escape_markdown_chars, write_bibtex_to_file

# Root URL of the INSPIRE-HEP REST API; endpoint paths such as
# "/literature" are appended to it.
BASE = "https://inspirehep.net/api"
# Request headers asking the API for a JSON response.
HEADERS_JSON = {"Accept": "application/json"}
# Pause inserted between consecutive page requests.  0.4 s stays under the
# documented limit of 15 requests per 5 s (one request per ~0.33 s).
DELAY = 0.4  # seconds between requests (15 req / 5 s limit)
# Total number of attempts http_get_with_retry makes for a single request.
MAX_RETRIES = 5
# The wait before the next attempt is RETRY_BACKOFF ** attempt seconds.
RETRY_BACKOFF = 2.0  # base for exponential backoff in seconds
# Number of records requested per page of search results.
PAGE_SIZE = 1000  # max allowed by INSPIRE

def log_to_stderr(msg: str) -> None:
    """Emit a progress or diagnostic line on standard error.

    Keeping status output off stdout lets callers pipe BibTeX (or any other
    data written to stdout) without it being polluted by log lines.

    Args:
        msg: Text to emit; a trailing newline is added.
    """
    # Look the stream up at call time so redirections of sys.stderr apply.
    stream = sys.stderr
    print(msg, file=stream)

def http_get_with_retry(
    url: str, params: dict | None = None, headers: dict | None = None
) -> requests.Response:
    """Perform an HTTP GET, retrying on rate limiting and connection errors.

    Makes up to ``MAX_RETRIES`` attempts.  An HTTP 429 response or a
    ``requests.ConnectionError`` triggers a wait of
    ``RETRY_BACKOFF ** attempt`` seconds before the next attempt.  No wait
    is performed after the final attempt: the failure is raised immediately.

    Args:
        url: The URL to fetch.
        params: Optional query parameters.
        headers: Optional request headers.

    Returns:
        The successful ``requests.Response`` object.

    Raises:
        requests.HTTPError: If the server answers with an error status other
            than 429, or still answers 429 on the final attempt.
        requests.ConnectionError: If a connection error occurs on the final
            attempt.
        ValueError: If ``MAX_RETRIES`` is configured below 1, so that no
            attempt could be made.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        is_last_attempt = attempt == MAX_RETRIES
        wait = RETRY_BACKOFF**attempt
        # Only the network call is guarded; status handling happens below so
        # that HTTP errors are never mistaken for connection failures.
        try:
            r = requests.get(url, params=params, headers=headers, timeout=60)
        except requests.ConnectionError:
            if is_last_attempt:
                raise
            log_to_stderr(
                f"  connection error, retrying in {wait:.1f}s (attempt {attempt}/{MAX_RETRIES})..."
            )
            time.sleep(wait)
            continue
        if r.status_code == 429 and not is_last_attempt:
            log_to_stderr(
                f"  rate-limited (429), waiting {wait:.1f}s (attempt {attempt}/{MAX_RETRIES})..."
            )
            time.sleep(wait)
            continue
        # Either a non-429 response, or a 429 with no attempts left: let
        # raise_for_status() turn any error status into HTTPError right away
        # instead of sleeping through one more pointless backoff period.
        r.raise_for_status()
        return r
    # Reached only when the loop body never ran.
    raise ValueError("MAX_RETRIES must be at least 1")

def resolve_arxiv_to_inspire_id(arxiv_id: str) -> int:
    """Look up the INSPIRE-HEP record ID that corresponds to an arXiv ID.

    Searches the literature endpoint for ``arxiv:<arxiv_id>``, requesting
    only the ``control_number`` field of a single hit.

    Args:
        arxiv_id: An arXiv paper identifier (e.g., ``2604.20797`` or
            ``hep-ph/0603175``).

    Returns:
        The ``control_number`` of the first hit.

    Raises:
        SystemExit: If the search returns no hits.
    """
    search_params = {
        "q": f"arxiv:{arxiv_id}",
        "fields": "control_number",
        "size": 1,
    }
    response = http_get_with_retry(
        f"{BASE}/literature", params=search_params, headers=HEADERS_JSON
    )
    matches = response.json()["hits"]["hits"]
    if matches:
        return matches[0]["metadata"]["control_number"]
    sys.exit(f"No INSPIRE record found for arXiv:{arxiv_id}")

def count_matching_records(recid: int, query_prefix: str) -> int:
    """Report how many records match ``<query_prefix>:recid:<recid>``.

    Only one hit is requested because just the ``total`` field of the
    search response is used.

    Args:
        recid: The INSPIRE record ID.
        query_prefix: The search prefix (e.g., ``"citedby"`` or
            ``"refersto"``).

    Returns:
        The ``hits.total`` value of the search response.
    """
    query = f"{query_prefix}:recid:{recid}"
    response = http_get_with_retry(
        f"{BASE}/literature",
        params={"q": query, "size": 1},
        headers=HEADERS_JSON,
    )
    payload = response.json()
    return payload["hits"]["total"]

def fetch_bibtex_paginated(recid: int, total: int, query_prefix: str) -> list[str]:
    """Download every BibTeX entry matching ``<query_prefix>:recid:<recid>``.

    Results are requested most-recent-first in pages of ``PAGE_SIZE``.  At
    least one page is always requested, and a pause of ``DELAY`` seconds
    separates consecutive page requests.

    Args:
        recid: The INSPIRE record ID.
        total: Expected number of entries; determines how many pages are
            requested.
        query_prefix: The search prefix (e.g., ``"citedby"`` or
            ``"refersto"``).

    Returns:
        The BibTeX entries as stripped strings, in the order received.
    """
    query = f"{query_prefix}:recid:{recid}"
    last_page = max(1, math.ceil(total / PAGE_SIZE))
    collected: list[str] = []
    for page_number in range(1, last_page + 1):
        # Pause before every page except the first one.
        if page_number > 1:
            time.sleep(DELAY)
        log_to_stderr(f"  fetching page {page_number}/{last_page}...")
        response = http_get_with_retry(
            f"{BASE}/literature",
            params={
                "q": query,
                "sort": "mostrecent",
                "size": PAGE_SIZE,
                "page": page_number,
                "format": "bibtex",
            },
            headers={"Accept": "application/x-bibtex"},
        )
        # Entries are separated by blank lines; drop empty fragments.
        chunks = (chunk.strip() for chunk in response.text.strip().split("\n\n"))
        collected.extend(chunk for chunk in chunks if chunk)
    return collected

def fetch_single_record_bibtex(recid: int) -> str:
    """Download the BibTeX entry of one INSPIRE literature record.

    Args:
        recid: The INSPIRE record ID.

    Returns:
        The response body with surrounding whitespace removed.
    """
    record_url = f"{BASE}/literature/{recid}"
    bibtex_headers = {"Accept": "application/x-bibtex"}
    response = http_get_with_retry(record_url, headers=bibtex_headers)
    return response.text.strip()

def fetch_inspire_json_record(recid: int) -> dict:
    """Download the JSON representation of one INSPIRE literature record.

    Args:
        recid: The INSPIRE record ID.

    Returns:
        The decoded JSON body of the response.
    """
    record_url = f"{BASE}/literature/{recid}"
    response = http_get_with_retry(record_url, headers=HEADERS_JSON)
    return response.json()

def extract_record_metadata(record: dict) -> dict:
    """Flatten the ``metadata`` section of an INSPIRE JSON record.

    Every field is optional in the input; missing fields fall back to the
    defaults listed below.

    Args:
        record: An INSPIRE JSON record dict with a ``metadata`` key (as
            returned by :func:`fetch_inspire_json_record`).

    Returns:
        A dict with the following keys:

        - ``title``: ``title`` of the first ``titles`` entry, else
          ``"Untitled"``.
        - ``authors``: ``full_name`` of every ``authors`` entry.
        - ``abstract``: ``value`` of the first ``abstracts`` entry, else
          ``""``.
        - ``arxiv_id``: ``value`` of the first ``arxiv_eprints`` entry, or
          ``None`` when there are no e-prints.
        - ``categories``: ``categories`` of all e-prints, de-duplicated in
          first-seen order.
        - ``inspire_categories``: ``term`` of every ``inspire_categories``
          entry.
        - ``doi``: ``value`` of the first DOI whose ``material`` is
          ``"publication"``, else of the first DOI, else ``None``.
        - ``preprint_date`` / ``earliest_date``: copied as-is, else ``""``.
        - ``journal_title``, ``journal_volume``, ``journal_issue``,
          ``journal_year``, ``artid``: taken from the first
          ``publication_info`` entry (``journal_year`` from its ``year``),
          else ``""``.
        - ``citation_count``: copied as-is, else ``0``.
    """
    metadata = record.get("metadata", {})

    # Title: first entry wins.
    title = "Untitled"
    title_entries = metadata.get("titles", [])
    if title_entries:
        title = title_entries[0].get("title", "Untitled")

    author_names = [
        author.get("full_name", "") for author in metadata.get("authors", [])
    ]

    # Abstract: first entry wins.
    abstract_text = ""
    abstract_entries = metadata.get("abstracts", [])
    if abstract_entries:
        abstract_text = abstract_entries[0].get("value", "")

    # arXiv e-prints: primary ID plus the union of all categories.
    eprints = metadata.get("arxiv_eprints", [])
    eprint_values = [eprint.get("value", "") for eprint in eprints]
    primary_arxiv_id = eprint_values[0] if eprint_values else None
    # dict.fromkeys() de-duplicates while preserving first-seen order.
    arxiv_categories = list(
        dict.fromkeys(
            category
            for eprint in eprints
            for category in eprint.get("categories", [])
        )
    )

    inspire_terms = [
        entry.get("term", "") for entry in metadata.get("inspire_categories", [])
    ]

    # DOI: prefer the one attached to the publication, else the first listed.
    doi_entries = metadata.get("dois", [])
    chosen_doi = next(
        (
            entry.get("value")
            for entry in doi_entries
            if entry.get("material") == "publication"
        ),
        None,
    )
    if not chosen_doi and doi_entries:
        chosen_doi = doi_entries[0].get("value")

    preprint_date = metadata.get("preprint_date", "")
    earliest_date = metadata.get("earliest_date", "")

    # Publication info: only the first entry is considered.
    publications = metadata.get("publication_info", [])
    first_publication = publications[0] if publications else {}
    # Output key -> key inside the publication_info entry.
    journal_key_map = {
        "journal_title": "journal_title",
        "journal_volume": "journal_volume",
        "journal_issue": "journal_issue",
        "journal_year": "year",
        "artid": "artid",
    }
    journal_fields = {
        out_key: first_publication.get(src_key, "")
        for out_key, src_key in journal_key_map.items()
    }

    return {
        "title": title,
        "authors": author_names,
        "abstract": abstract_text,
        "arxiv_id": primary_arxiv_id,
        "categories": arxiv_categories,
        "inspire_categories": inspire_terms,
        "doi": chosen_doi,
        "preprint_date": preprint_date,
        "earliest_date": earliest_date,
        **journal_fields,
        "citation_count": metadata.get("citation_count", 0),
    }

def count_papers_by_author(author_id: str) -> int:
    """Report how many literature records match ``a <author_id>``.

    Only one hit is requested because just the ``total`` field of the
    search response is used.

    Args:
        author_id: INSPIRE author identifier (e.g., ``"E.Witten.1"`` or
            ``"J.Maldacena.1"``).

    Returns:
        The ``hits.total`` value of the search response.
    """
    author_query = f"a {author_id}"
    response = http_get_with_retry(
        f"{BASE}/literature",
        params={"q": author_query, "size": 1},
        headers=HEADERS_JSON,
    )
    payload = response.json()
    return payload["hits"]["total"]

def fetch_bibtex_by_author(author_id: str) -> list[str]:
    """Download the BibTeX entries of every paper matching ``a <author_id>``.

    The number of papers is looked up first.  If it is zero nothing further
    is requested.  Otherwise results are fetched most-recent-first in pages
    of ``PAGE_SIZE``, pausing ``DELAY`` seconds between page requests.

    Args:
        author_id: INSPIRE author identifier (e.g., ``"E.Witten.1"`` or
            ``"J.Maldacena.1"``).

    Returns:
        The BibTeX entries as stripped strings, in the order received.
    """
    paper_count = count_papers_by_author(author_id)
    log_to_stderr(f"  {paper_count} papers by author '{author_id}' indexed in INSPIRE")
    if paper_count == 0:
        return []

    author_query = f"a {author_id}"
    last_page = max(1, math.ceil(paper_count / PAGE_SIZE))
    collected: list[str] = []
    for page_number in range(1, last_page + 1):
        # Pause before every page except the first one.
        if page_number > 1:
            time.sleep(DELAY)
        log_to_stderr(f"  fetching page {page_number}/{last_page}...")
        response = http_get_with_retry(
            f"{BASE}/literature",
            params={
                "q": author_query,
                "sort": "mostrecent",
                "size": PAGE_SIZE,
                "page": page_number,
                "format": "bibtex",
            },
            headers={"Accept": "application/x-bibtex"},
        )
        # Entries are separated by blank lines; drop empty fragments.
        chunks = (chunk.strip() for chunk in response.text.strip().split("\n\n"))
        collected.extend(chunk for chunk in chunks if chunk)
    return collected

def fetch_and_write_bibtex_to_file(
    arxiv_id: str, recid: int, query_prefix: str, out_path: str
) -> None:
    """Download the BibTeX entries for a query and save them to a file.

    Uses :func:`count_matching_records` to find out how many records match
    ``<query_prefix>:recid:<recid>`` and, when there are any,
    :func:`fetch_bibtex_paginated` to download them.  The result (possibly
    an empty list) is handed to ``write_bibtex_to_file``.

    Args:
        arxiv_id: The arXiv ID of the paper (not used by this function).
        recid: The INSPIRE record ID.
        query_prefix: The search prefix (``"citedby"`` for references,
            ``"refersto"`` for citations).
        out_path: File path to write the BibTeX entries to.
    """
    # Anything other than "citedby" is reported as citations in the log.
    if query_prefix == "citedby":
        kind = "references"
    else:
        kind = "citations"

    match_count = count_matching_records(recid, query_prefix)
    log_to_stderr(f"  {match_count} {kind} indexed in INSPIRE")

    bibtex_entries: list[str] = []
    if match_count != 0:
        bibtex_entries = fetch_bibtex_paginated(recid, match_count, query_prefix)
        log_to_stderr(f"  {len(bibtex_entries)} BibTeX entries retrieved")
    write_bibtex_to_file(bibtex_entries, out_path)

def fetch_and_write_references_bibtex(arxiv_id: str, recid: int, out_path: str) -> None:
    """Save the BibTeX of a record's references to a file.

    Delegates to :func:`fetch_and_write_bibtex_to_file` with the
    ``"citedby"`` search prefix (query ``citedby:recid:<recid>``).

    Args:
        arxiv_id: The arXiv ID of the paper (passed through unchanged).
        recid: The INSPIRE record ID.
        out_path: File path to write the BibTeX entries to.
    """
    fetch_and_write_bibtex_to_file(
        arxiv_id,
        recid,
        query_prefix="citedby",
        out_path=out_path,
    )

def fetch_and_write_citations_bibtex(arxiv_id: str, recid: int, out_path: str) -> None:
    """Save the BibTeX of the papers citing a record to a file.

    Delegates to :func:`fetch_and_write_bibtex_to_file` with the
    ``"refersto"`` search prefix (query ``refersto:recid:<recid>``).

    Args:
        arxiv_id: The arXiv ID of the paper (passed through unchanged).
        recid: The INSPIRE record ID.
        out_path: File path to write the BibTeX entries to.
    """
    fetch_and_write_bibtex_to_file(
        arxiv_id,
        recid,
        query_prefix="refersto",
        out_path=out_path,
    )

def fetch_and_write_bibtex_by_author_to_file(author_id: str, out_path: str) -> None:
    """Download an author's BibTeX entries and save them to a file.

    Retrieves the entries with :func:`fetch_bibtex_by_author`, logs how many
    were retrieved, and hands the list (possibly empty) to
    ``write_bibtex_to_file``.

    Args:
        author_id: INSPIRE author identifier (e.g., ``"E.Witten.1"`` or
            ``"J.Maldacena.1"``).
        out_path: File path to write the BibTeX entries to.
    """
    bibtex_entries = fetch_bibtex_by_author(author_id)
    retrieved = len(bibtex_entries)
    log_to_stderr(f"  {retrieved} BibTeX entries retrieved")
    write_bibtex_to_file(bibtex_entries, out_path)

def format_iso_date_to_display(date_str: str) -> str:
    """Render a ``YYYY-MM-DD`` date as ``D Month YYYY``.

    For example ``2026-04-15`` becomes ``15 April 2026`` and ``2026-04-05``
    becomes ``5 April 2026``.

    Args:
        date_str: Date string in ``YYYY-MM-DD`` format, or empty string.

    Returns:
        The formatted date; the input unchanged when it does not parse as
        ``YYYY-MM-DD``; or ``""`` when the input is empty.
    """
    from datetime import datetime

    if date_str:
        try:
            parsed = datetime.strptime(date_str, "%Y-%m-%d")
            rendered = parsed.strftime("%d %B %Y")
        except ValueError:
            # Not a full ISO date: hand it back untouched.
            return date_str
        # %d is zero-padded; drop the padding from single-digit days.
        return rendered.lstrip("0")
    return ""

def format_categories_markdown(categories: list[str], inspire_cats: list[str]) -> str:
    """Render subject categories as a markdown bullet list.

    The arXiv categories come first, followed by the INSPIRE categories;
    a category that occurs more than once is listed only the first time.

    Args:
        categories: arXiv category strings.
        inspire_cats: INSPIRE category strings.

    Returns:
        One ``- category`` line per distinct category (each passed through
        ``escape_markdown_chars``), or ``"- (none)"`` when there are none.
    """
    # dict.fromkeys() de-duplicates while preserving first-seen order.
    distinct = dict.fromkeys([*categories, *inspire_cats])
    if not distinct:
        return "- (none)"
    return "\n".join(f"- {escape_markdown_chars(name)}" for name in distinct)

def format_journal_reference(meta: dict) -> str:
    """Assemble a one-line journal reference from metadata fields.

    The non-empty fields are joined with single spaces in this order:
    journal title, volume, issue (in parentheses), article ID, year (in
    parentheses).

    Args:
        meta: Metadata dict as returned by :func:`extract_record_metadata`.

    Returns:
        The reference string (e.g., ``"Phys.Rev.D 105 (7) 074501 (2022)"``),
        or an empty string when all five fields are empty.
    """

    def _bare(value):
        return value

    def _parenthesised(value):
        return f"({value})"

    # (metadata key, renderer) pairs in output order.
    layout = (
        ("journal_title", _bare),
        ("journal_volume", _bare),
        ("journal_issue", _parenthesised),
        ("artid", _bare),
        ("journal_year", _parenthesised),
    )
    pieces = [render(meta[key]) for key, render in layout if meta[key]]
    return " ".join(pieces)

def build_markdown_abstract_page(recid: int, meta: dict) -> str:
    """Build a markdown abstract page from extracted metadata.

    Produces a markdown document with title, INSPIRE/arXiv/DOI links,
    authors, submission date, subject categories, journal reference,
    citation count, and abstract text.  The arXiv, DOI, submission date,
    journal reference and citation count lines are omitted when the
    corresponding metadata is empty (or, for the citation count, zero).

    Args:
        recid: The INSPIRE record ID (used to construct the INSPIRE link).
        meta: Metadata dict as returned by :func:`extract_record_metadata`.

    Returns:
        A markdown string suitable for writing to a ``_abstract.md`` file.
    """
    from urllib.parse import quote

    lines = []

    # Title
    lines.append(f"# {escape_markdown_chars(meta['title'])}")
    lines.append("")

    # INSPIRE link
    lines.append(f"**INSPIRE:** [{recid}](https://inspirehep.net/literature/{recid})")

    # arXiv link (if available).  The identifier is percent-encoded in the
    # URL so characters such as ")" or spaces cannot end the link early.
    arxiv_id = meta["arxiv_id"]
    if arxiv_id:
        lines.append(
            f"**arXiv:** [{escape_markdown_chars(arxiv_id)}](https://arxiv.org/abs/{quote(arxiv_id)})"
        )

    # DOI (if available).  DOIs may contain "(", ")", "<", ">" and similar
    # characters, so the URL form is percent-encoded ("/" is kept as-is).
    doi = meta["doi"]
    if doi:
        lines.append(
            f"**DOI:** [{escape_markdown_chars(doi)}](https://doi.org/{quote(doi)})"
        )

    lines.append("")

    # Authors
    lines.append(
        f"**Authors:** {', '.join(escape_markdown_chars(a) for a in meta['authors'])}"
    )
    lines.append("")

    # Date: prefer the preprint date, fall back to the earliest date.
    date_str = meta["preprint_date"] or meta["earliest_date"]
    if date_str:
        lines.append(f"**Submitted:** {format_iso_date_to_display(date_str)}")
        lines.append("")

    # Subjects
    lines.append("**Subjects:**")
    lines.append(
        format_categories_markdown(meta["categories"], meta["inspire_categories"])
    )
    lines.append("")

    # Journal reference
    journal_str = format_journal_reference(meta)
    if journal_str:
        lines.append(f"**Journal reference:** {escape_markdown_chars(journal_str)}")
        lines.append("")

    # Citation count (a count of zero is not shown)
    if meta["citation_count"]:
        lines.append(f"**Citations:** {meta['citation_count']}")
        lines.append("")

    # Abstract
    lines.append("## Abstract")
    lines.append("")
    lines.append(escape_markdown_chars(meta["abstract"]))
    lines.append("")

    return "\n".join(lines)

def format_categories_html_list(categories: list[str], inspire_cats: list[str]) -> str:
    """Render subject categories as an HTML ``<ul>`` list.

    The arXiv categories come first, followed by the INSPIRE categories;
    a category that occurs more than once is listed only the first time.

    Args:
        categories: arXiv category strings.
        inspire_cats: INSPIRE category strings.

    Returns:
        A ``<ul>`` element with one ``<li>`` per distinct category (each
        passed through ``escape_html_entities``), or
        ``<ul><li>(none)</li></ul>`` when there are none.
    """
    # dict.fromkeys() de-duplicates while preserving first-seen order.
    distinct = dict.fromkeys([*categories, *inspire_cats])
    if not distinct:
        return "<ul><li>(none)</li></ul>"
    list_items = "\n".join(
        f"<li>{escape_html_entities(name)}</li>" for name in distinct
    )
    return "<ul>\n" + list_items + "\n</ul>"

def build_html_abstract_page(recid: int, meta: dict) -> str:
    """Build an HTML abstract page from extracted metadata.

    Produces an ``<article>`` element with title, INSPIRE/arXiv/DOI links,
    authors, submission date, subject categories, journal reference,
    citation count, and abstract text.  The arXiv, DOI, submission date,
    journal reference and citation count parts are omitted when the
    corresponding metadata is empty (or, for the citation count, zero).

    Args:
        recid: The INSPIRE record ID (used to construct the INSPIRE link).
        meta: Metadata dict as returned by :func:`extract_record_metadata`.

    Returns:
        An HTML string (fragment) suitable for wrapping with
        :func:`arxiv_lib.wrap_html_document`.
    """
    from urllib.parse import quote

    lines = []
    lines.append("<article>")

    # Title
    lines.append(f"<h1>{escape_html_entities(meta['title'])}</h1>")

    # Links
    lines.append("<p>")
    lines.append(
        f"<strong>INSPIRE:</strong> "
        f'<a href="https://inspirehep.net/literature/{recid}">{recid}</a>'
    )
    # Identifiers are percent-encoded inside href so that characters such as
    # '"', "<", ">" or "&" (legal in DOIs) cannot break out of the attribute.
    # The visible link text is HTML-escaped separately.
    arxiv_id = meta["arxiv_id"]
    if arxiv_id:
        lines.append(
            f"<br><strong>arXiv:</strong> "
            f'<a href="https://arxiv.org/abs/{quote(arxiv_id)}">{escape_html_entities(arxiv_id)}</a>'
        )
    doi = meta["doi"]
    if doi:
        lines.append(
            f"<br><strong>DOI:</strong> "
            f'<a href="https://doi.org/{quote(doi)}">{escape_html_entities(doi)}</a>'
        )
    lines.append("</p>")

    # Authors
    lines.append(
        f"<p><strong>Authors:</strong> {', '.join(escape_html_entities(a) for a in meta['authors'])}</p>"
    )

    # Date: prefer the preprint date, fall back to the earliest date.
    date_str = meta["preprint_date"] or meta["earliest_date"]
    if date_str:
        lines.append(
            f"<p><strong>Submitted:</strong> {format_iso_date_to_display(date_str)}</p>"
        )

    # Subjects
    lines.append("<p><strong>Subjects:</strong></p>")
    lines.append(
        format_categories_html_list(meta["categories"], meta["inspire_categories"])
    )

    # Journal reference
    journal_str = format_journal_reference(meta)
    if journal_str:
        lines.append(
            f"<p><strong>Journal reference:</strong> {escape_html_entities(journal_str)}</p>"
        )

    # Citation count (a count of zero is not shown)
    if meta["citation_count"]:
        lines.append(f"<p><strong>Citations:</strong> {meta['citation_count']}</p>")

    # Abstract
    lines.append("<h2>Abstract</h2>")
    lines.append(f"<p>{escape_html_entities(meta['abstract'])}</p>")

    lines.append("</article>")
    return "\n".join(lines) + "\n"