#!/usr/bin/env python3
"""Shared action functions for arXiv paper management scripts.\n
Consolidates process-paper / download / merge logic that was previously
duplicated across individual CLI scripts.  Each public function is the
"do the work" body extracted from a script; the scripts themselves now
import from here and keep only their ``main()`` / argparse glue.\n
Functions are grouped into sections:\n
- **Discovery helpers** -- finding downloaded papers on disk.
- **Per-paper download actions** -- fetching PDF, abstract, BibTeX,
  references, and citations for individual papers.
- **Refresh citations** -- re-downloading citation data when counts change.
- **Merge all info** -- combining per-paper files into repository-wide
  summary files.
- **BibTeX utilities** -- extracting arXiv IDs from .bib files.
"""

import glob
import json
import os
import re
import urllib.request

from arxiv_lib import (
    BASE_DIR,
    arxiv_id_to_directory_path,
    arxiv_id_to_safe_filename,
    arxiv_id_sort_key,
    wrap_in_html_document,
)
from bibtex_lib import (
    convert_entries_to_html_document,
    convert_entries_to_markdown_list,
    merge_bib_files_and_write,
    merge_bibtex_strings,
    parse_bibtex_to_entries,
    strip_outer_braces,
    write_bibtex_to_file,
)
from inspire_lib import (
    build_html_abstract_page,
    build_markdown_abstract_page,
    count_matching_records,
    extract_record_metadata,
    fetch_and_write_bibtex_by_author_to_file,
    fetch_and_write_citations_bibtex,
    fetch_and_write_references_bibtex,
    fetch_inspire_json_record,
    fetch_single_record_bibtex,
    log_to_stderr,
    resolve_arxiv_to_inspire_id,
)

# ============================================================================
# Discovery helpers
# ============================================================================

def prepare_paper_paths(arxiv_id: str) -> tuple[str, str]:
    """Compute a paper's file stem and directory, creating the directory.

    Args:
        arxiv_id: An arXiv paper identifier.

    Returns:
        A ``(safe_filename, folder_path)`` tuple built from
        :func:`arxiv_id_to_safe_filename` and
        :func:`arxiv_id_to_directory_path`.  The folder is guaranteed to
        exist when this function returns.
    """
    file_stem = arxiv_id_to_safe_filename(arxiv_id)
    paper_dir = arxiv_id_to_directory_path(arxiv_id)
    # exist_ok makes repeated calls for the same paper harmless.
    os.makedirs(paper_dir, exist_ok=True)
    return (file_stem, paper_dir)

def scan_for_downloaded_papers(root_dir: str | None = None) -> list[str]:
    """Return a sorted list of arXiv IDs for all downloaded papers.\n
    Scans recursively for ``*.pdf`` files under *root_dir*, extracts
    arXiv IDs from filenames (new-style ``YYMM.NNNNN`` or old-style
    ``category_YYMMNNN``), and sorts them by date (newest first).\n
    Args:
        root_dir: Root directory to scan.  Defaults to the repository
            root (parent of the ``scripts/`` directory).\n
    Returns:
        A list of arXiv ID strings sorted chronologically (newest first).
    """
    if root_dir is None:
        root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    ids: list[str] = []
    for pdf in glob.glob(os.path.join(root_dir, "**", "*.pdf"), recursive=True):
        name = os.path.basename(pdf)
        stem = name[:-4]
        if re.match(r"^\d{4}\.\d{4,5}$", stem):
            ids.append(stem)
        elif "_" in stem:
            parts = stem.split("_", 1)
            ids.append(f"{parts[0]}/{parts[1]}")
    ids.sort(key=arxiv_id_sort_key, reverse=True)
    return ids

def count_entries_in_bib_file(bib_path: str) -> int | None:
    """Return the number of entries in an existing BibTeX file.\n
    Args:
        bib_path: Path to a .bib file.\n
    Returns:
        The number of BibTeX entries in the file, or ``None`` if the
        file does not exist.
    """
    if not os.path.exists(bib_path):
        return None
    with open(bib_path) as f:
        entries = parse_bibtex_to_entries(f.read())
    return len(entries)

# ============================================================================
# Per-paper download actions
# ============================================================================

def resolve_recid(arxiv_id: str, recid: int | None = None) -> int:
    """Resolve an arXiv ID to an INSPIRE record ID, or return *recid* if given."""
    if recid is not None:
        return recid
    log_to_stderr(f"Looking up arXiv:{arxiv_id} on INSPIRE...")
    recid = resolve_arxiv_to_inspire_id(arxiv_id)
    log_to_stderr(f"  recid = {recid}")
    return recid

def convert_bib_to_lists(bib_path: str, md_path: str, html_path: str) -> None:
    """Render a BibTeX file as a markdown list and as an HTML document.

    Args:
        bib_path: Path to the input .bib file.
        md_path: Destination passed to :func:`convert_entries_to_markdown_list`.
        html_path: Destination passed to :func:`convert_entries_to_html_document`.
    """
    # Markdown first, then HTML -- each converter gets the same input file.
    renderers = (
        (convert_entries_to_markdown_list, md_path),
        (convert_entries_to_html_document, html_path),
    )
    for render, destination in renderers:
        render(bib_path, destination)

def convert_references_to_lists(arxiv_id: str) -> None:
    """Regenerate the references ``.md`` and ``.html`` lists for a paper.

    Reads ``<id>_references.bib`` from the paper's folder and writes
    ``<id>_references.md`` and ``<id>_references.html`` next to it.

    Args:
        arxiv_id: An arXiv paper identifier.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    # All three files share one stem and differ only in their extension.
    stem = os.path.join(folder, f"{base}_references")
    convert_bib_to_lists(f"{stem}.bib", f"{stem}.md", f"{stem}.html")

def convert_citations_to_lists(arxiv_id: str) -> None:
    """Regenerate the citations ``.md`` and ``.html`` lists for a paper.

    Reads ``<id>_citations.bib`` from the paper's folder and writes
    ``<id>_citations.md`` and ``<id>_citations.html`` next to it.

    Args:
        arxiv_id: An arXiv paper identifier.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    # All three files share one stem and differ only in their extension.
    stem = os.path.join(folder, f"{base}_citations")
    convert_bib_to_lists(f"{stem}.bib", f"{stem}.md", f"{stem}.html")

def convert_abstract_to_pages(
    recid: int, meta: dict, md_path: str, html_path: str, title: str = ""
) -> None:
    """Write the markdown and HTML abstract pages for one record.

    Args:
        recid: INSPIRE record ID.
        meta: Metadata dict from :func:`extract_record_metadata`.
        md_path: File that receives the markdown page.
        html_path: File that receives the HTML page.
        title: Title handed to :func:`wrap_in_html_document`.
    """
    with open(md_path, "w") as md_file:
        markdown_page = build_markdown_abstract_page(recid, meta)
        md_file.write(markdown_page)
    with open(html_path, "w") as html_file:
        # The builder yields a fragment; wrap it into a full document.
        html_fragment = build_html_abstract_page(recid, meta)
        html_file.write(wrap_in_html_document(html_fragment, title=title))

def download_paper_pdf_from_arxiv(arxiv_id: str) -> None:
    """Download the PDF for a single arXiv paper.

    Fetches the PDF from ``https://arxiv.org/pdf/<arxiv_id>`` and saves
    it as ``<safe_id>.pdf`` in the paper's folder (created if missing).

    The data is first written to ``<safe_id>.pdf.part`` and only renamed
    to the final name once the transfer has completed.  An interrupted
    download therefore never leaves a truncated ``.pdf`` behind (which
    later runs would mistake for a finished download) and never destroys
    a previously downloaded copy.

    Args:
        arxiv_id: An arXiv paper identifier (e.g., ``2604.20797`` or
            ``hep-ph/0603175``).

    Raises:
        Exception: Whatever :func:`urllib.request.urlretrieve` or the
            final rename raises; the partial file is removed first.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    pdf_path = os.path.join(folder, f"{base}.pdf")
    partial_path = f"{pdf_path}.part"
    url = f"https://arxiv.org/pdf/{arxiv_id}"
    log_to_stderr(f"Downloading PDF from {url} ...")
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Same directory, so this is a plain rename onto the final name.
        os.replace(partial_path, pdf_path)
    finally:
        # After a successful rename the partial file is gone; otherwise
        # clean up whatever the failed transfer left behind.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                # Best-effort cleanup must not mask the original error.
                pass
    log_to_stderr(f"  Saved to {pdf_path}")

def download_abstract_json(arxiv_id: str, recid: int | None = None) -> tuple[dict, int]:
    """Fetch a paper's INSPIRE JSON record and store it on disk.

    The record is written, pretty-printed, to ``<id>.json`` inside the
    paper's folder.

    Args:
        arxiv_id: An arXiv paper identifier.
        recid: Optional pre-resolved INSPIRE record ID.  When ``None``
            the ID is resolved through :func:`resolve_recid`.

    Returns:
        A ``(record, recid)`` tuple: the object returned by
        :func:`fetch_inspire_json_record` and the record ID that was used
        to fetch it.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    resolved_id = resolve_recid(arxiv_id, recid)
    target = os.path.join(folder, f"{base}.json")

    log_to_stderr("Fetching abstract metadata...")
    record = fetch_inspire_json_record(resolved_id)
    with open(target, "w") as out:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(record, out, indent=2, ensure_ascii=False)
    log_to_stderr(f"  Saved JSON to {target}")
    return (record, resolved_id)

def download_and_save_abstract(arxiv_id: str, recid: int | None = None) -> None:
    """Fetch a paper's INSPIRE metadata and write its abstract pages.

    Three files end up in the paper's folder:

    - ``<id>.json`` — Raw INSPIRE JSON record (via
      :func:`download_abstract_json`).
    - ``<id>_abstract.md`` — Markdown abstract page.
    - ``<id>_abstract.html`` — HTML abstract page.

    Args:
        arxiv_id: An arXiv paper identifier.
        recid: Optional pre-resolved INSPIRE record ID.  When ``None``
            the ID is resolved via the INSPIRE API.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    record, recid = download_abstract_json(arxiv_id, recid)

    meta = extract_record_metadata(record)
    paper_title = meta["title"]
    log_to_stderr(f"  Title: {paper_title}")
    # Show at most three authors and flag that the list was shortened.
    author_names = meta["authors"]
    shown = ", ".join(author_names[:3])
    suffix = "..." if len(author_names) > 3 else ""
    log_to_stderr(f"  Authors: {shown}{suffix}")

    page_stem = os.path.join(folder, f"{base}_abstract")
    written = (f"{page_stem}.md", f"{page_stem}.html")
    convert_abstract_to_pages(recid, meta, written[0], written[1], paper_title)
    for destination in written:
        log_to_stderr(f"  Saved to {destination}")

def download_and_save_bibtex(arxiv_id: str, recid: int | None = None) -> None:
    """Fetch the paper's own BibTeX entry from INSPIRE-HEP and save it.

    The entry is written to ``<id>_bib.bib`` in the paper's folder.

    Args:
        arxiv_id: An arXiv paper identifier.
        recid: Optional pre-resolved INSPIRE record ID.  When ``None``
            the ID is resolved via the INSPIRE API.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    resolved_id = resolve_recid(arxiv_id, recid)
    destination = os.path.join(folder, f"{base}_bib.bib")

    log_to_stderr("Fetching paper BibTeX...")
    entry_text = fetch_single_record_bibtex(resolved_id)
    # The writer takes a list of BibTeX strings; here there is just one.
    write_bibtex_to_file([entry_text], destination)
    log_to_stderr(f"  Saved to {destination}")

def download_references_bibtex(arxiv_id: str, recid: int | None = None) -> str:
    """Fetch the references of a paper as BibTeX and store them on disk.

    Args:
        arxiv_id: An arXiv paper identifier.
        recid: Optional pre-resolved INSPIRE record ID.

    Returns:
        Path of the ``<id>_references.bib`` file that was written.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    resolved_id = resolve_recid(arxiv_id, recid)
    destination = os.path.join(folder, f"{base}_references.bib")

    log_to_stderr("Fetching references...")
    fetch_and_write_references_bibtex(arxiv_id, resolved_id, destination)
    log_to_stderr(f"  Saved to {destination}")
    return destination

def download_and_save_references(arxiv_id: str, recid: int | None = None) -> None:
    """Download a paper's references and render them as lists.

    Runs :func:`download_references_bibtex` followed by
    :func:`convert_references_to_lists`, which leaves these files in the
    paper's folder:

    - ``<id>_references.bib`` — BibTeX entries.
    - ``<id>_references.md`` — Markdown list.
    - ``<id>_references.html`` — HTML list.

    Args:
        arxiv_id: An arXiv paper identifier.
        recid: Optional pre-resolved INSPIRE record ID.  When ``None``
            the ID is resolved via the INSPIRE API.
    """
    # The returned .bib path is not needed: the converter rebuilds it
    # from the arXiv ID.
    download_references_bibtex(arxiv_id, recid=recid)
    convert_references_to_lists(arxiv_id)

def download_citations_bibtex(arxiv_id: str, recid: int | None = None) -> str:
    """Fetch the citations of a paper as BibTeX and store them on disk.

    Args:
        arxiv_id: An arXiv paper identifier.
        recid: Optional pre-resolved INSPIRE record ID.

    Returns:
        Path of the ``<id>_citations.bib`` file that was written.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    resolved_id = resolve_recid(arxiv_id, recid)
    destination = os.path.join(folder, f"{base}_citations.bib")

    log_to_stderr("Fetching citations...")
    fetch_and_write_citations_bibtex(arxiv_id, resolved_id, destination)
    log_to_stderr(f"  Saved to {destination}")
    return destination

def download_and_save_citations(arxiv_id: str, recid: int | None = None) -> None:
    """Download a paper's citations and render them as lists.

    Runs :func:`download_citations_bibtex` followed by
    :func:`convert_citations_to_lists`, which leaves these files in the
    paper's folder:

    - ``<id>_citations.bib`` — BibTeX entries.
    - ``<id>_citations.md`` — Markdown list.
    - ``<id>_citations.html`` — HTML list.

    Args:
        arxiv_id: An arXiv paper identifier.
        recid: Optional pre-resolved INSPIRE record ID.  When ``None``
            the ID is resolved via the INSPIRE API.
    """
    # The returned .bib path is not needed: the converter rebuilds it
    # from the arXiv ID.
    download_citations_bibtex(arxiv_id, recid=recid)
    convert_citations_to_lists(arxiv_id)

def author_id_to_filename(author_id: str) -> str:
    """Turn an INSPIRE author ID into a filename stem.

    Every dot is replaced by an underscore, e.g. ``E.Witten.1`` becomes
    ``E_Witten_1``.  No other character is touched.

    Args:
        author_id: INSPIRE author identifier.

    Returns:
        The identifier with dots replaced (no extension added).
    """
    dot_separated_parts = author_id.split(".")
    return "_".join(dot_separated_parts)

def regenerate_author_lists(author_id: str) -> None:
    """Rebuild an author's ``.md`` and ``.html`` lists from the ``.bib`` file.

    Works on ``authors/<stem>.bib`` under ``BASE_DIR``, where ``<stem>``
    comes from :func:`author_id_to_filename`.

    Args:
        author_id: INSPIRE author identifier (e.g. ``"E.Witten.1"``).
    """
    stem_path = os.path.join(BASE_DIR, "authors", author_id_to_filename(author_id))
    source_bib = f"{stem_path}.bib"
    outputs = (
        (convert_entries_to_markdown_list, f"{stem_path}.md"),
        (convert_entries_to_html_document, f"{stem_path}.html"),
    )
    # Write both lists first, then report both, in markdown/HTML order.
    for render, destination in outputs:
        render(source_bib, destination)
    for _render, destination in outputs:
        log_to_stderr(f"  Saved to {destination}")

def download_author_bib(author_id: str) -> None:
    """Fetch every paper of one author as BibTeX and render the lists.

    Creates the ``authors/`` directory under ``BASE_DIR`` if needed and
    leaves three files in it:

    - ``<name>.bib``  — BibTeX entries.
    - ``<name>.md``   — Markdown list.
    - ``<name>.html`` — HTML list.

    Args:
        author_id: INSPIRE author identifier (e.g. ``"E.Witten.1"``).
    """
    file_stem = author_id_to_filename(author_id)
    target_dir = os.path.join(BASE_DIR, "authors")
    os.makedirs(target_dir, exist_ok=True)
    destination = os.path.join(target_dir, f"{file_stem}.bib")

    log_to_stderr(f"=== {author_id} ===")
    fetch_and_write_bibtex_by_author_to_file(author_id, destination)
    log_to_stderr(f"  Saved to {destination}")

    # The .md/.html lists are always derived from the fresh .bib file.
    regenerate_author_lists(author_id)

def sync_author_bib(author_id: str, force: bool = False) -> bool:
    """Bring one author's BibTeX file in line with INSPIRE-HEP.

    The number of entries in the local ``authors/<name>.bib`` is compared
    with the paper count reported by INSPIRE.  A full re-download happens
    when the two differ, when the local file is missing, or when *force*
    is ``True``.  Otherwise only the ``.md`` and ``.html`` lists are
    regenerated from the existing ``.bib`` file.

    Args:
        author_id: INSPIRE author identifier (e.g. ``"E.Witten.1"``).
        force: If ``True``, re-download regardless of count comparison.

    Returns:
        ``True`` if the author bib was re-downloaded, ``False`` if it was
        already up-to-date.
    """
    from inspire_lib import count_papers_by_author

    local_bib = os.path.join(
        BASE_DIR, "authors", f"{author_id_to_filename(author_id)}.bib"
    )

    local_count = count_entries_in_bib_file(local_bib)
    if local_count is None:
        log_to_stderr(f"  No existing bib file for {author_id}, downloading...")

    remote_count = count_papers_by_author(author_id)
    log_to_stderr(f"  INSPIRE has {remote_count} papers, local file has {local_count}")

    # A missing file yields local_count None, which never equals the
    # remote count, so it always takes the download path.
    if force or remote_count != local_count:
        download_author_bib(author_id)
        return True

    log_to_stderr("  Bib up-to-date, regenerating .md and .html")
    regenerate_author_lists(author_id)
    return False

def sync_all_author_bibs(force: bool = False) -> tuple[int, int]:
    """Run :func:`sync_author_bib` for every author in ``authors.txt``.

    ``authors.txt`` is read from ``BASE_DIR``; each non-blank line, with
    surrounding whitespace removed, is taken as one author ID.

    Args:
        force: If ``True``, re-download even if the paper count has not changed.

    Returns:
        An ``(updated, total)`` tuple: how many authors were re-downloaded
        and how many authors were processed.
    """
    authors_file = os.path.join(BASE_DIR, "authors.txt")
    author_ids: list[str] = []
    with open(authors_file) as listing:
        for raw_line in listing:
            candidate = raw_line.strip()
            if candidate:
                author_ids.append(candidate)
    total = len(author_ids)
    log_to_stderr(f"Found {total} authors in {authors_file}")

    refreshed = 0
    for author_id in author_ids:
        log_to_stderr(f"\n=== {author_id} ===")
        was_updated = sync_author_bib(author_id, force)
        if was_updated:
            refreshed += 1

    log_to_stderr(f"\nDone. Updated bibs for {refreshed}/{total} authors.")
    return refreshed, total

def _fetch_step(what: str, arxiv_id: str, recid: int | None, action) -> None:
    """Run one INSPIRE download step, naming the step in the log if it fails.

    Args:
        what: Human-readable name of the data being fetched.
        arxiv_id: An arXiv paper identifier.
        recid: INSPIRE record ID handed on to *action*.
        action: Callable invoked as ``action(arxiv_id, recid)``.

    Raises:
        Exception: Whatever *action* raises, re-raised unchanged.
    """
    try:
        action(arxiv_id, recid)
    except Exception:
        log_to_stderr(f"  Failed to fetch {what} for {arxiv_id}")
        raise


def download_all_paper_data(arxiv_id: str, force: bool = False) -> None:
    """Download all information for a single arXiv paper.

    Orchestrates the full download pipeline, writing the following files
    to the paper's folder:

    1. ``<id>.pdf`` — PDF from arXiv.
    2. ``<id>.json`` — Raw INSPIRE JSON record.
    3. ``<id>_abstract.md`` / ``<id>_abstract.html`` — Abstract pages.
    4. ``<id>_bib.bib`` — Paper BibTeX.
    5. ``<id>_references.bib/.md/.html`` — References.
    6. ``<id>_citations.bib/.md/.html`` — Citations.

    Existing files are skipped unless *force* is ``True``.  When not
    forcing, the INSPIRE record ID is taken from the cached ``<id>.json``
    to avoid a lookup.  A cached JSON file that cannot be read or parsed
    (for example one truncated by an interrupted run) is treated as
    missing: the record ID is looked up again and the JSON and abstract
    pages are re-downloaded.

    A failed PDF download is logged and the remaining steps still run;
    a failure in any INSPIRE step is logged and re-raised.

    Args:
        arxiv_id: An arXiv paper identifier.
        force: If ``True``, re-download all files even if they exist.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    #
    # 1. Download PDF
    pdf_path = os.path.join(folder, f"{base}.pdf")
    if force or not os.path.exists(pdf_path):
        try:
            download_paper_pdf_from_arxiv(arxiv_id)
        except Exception as e:
            # The PDF is optional for the rest of the pipeline.
            log_to_stderr(f"  Failed to download PDF for {arxiv_id}: {e}")
            log_to_stderr("  Continuing with remaining steps...")
    else:
        log_to_stderr(f"PDF already exists at {pdf_path}, skipping download")
    #
    # Resolve INSPIRE recid (use cache when available)
    json_path = os.path.join(folder, f"{base}.json")
    recid: int | None = None
    cache_usable = False
    if not force and os.path.exists(json_path):
        try:
            with open(json_path) as f:
                cached_record = json.load(f)
            recid = cached_record.get("id") or cached_record["metadata"]["control_number"]
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            # ValueError covers JSON and text-decoding errors; the others
            # cover unreadable files and records of unexpected shape.
            log_to_stderr(f"  Could not use cached JSON {json_path}: {e}")
            log_to_stderr("  Falling back to a fresh INSPIRE lookup...")
        else:
            cache_usable = True
            log_to_stderr(f"Using cached recid = {recid} from {json_path}")
    if not cache_usable:
        recid = resolve_recid(arxiv_id)
    #
    # 2. Fetch abstract metadata
    if force or not cache_usable:
        _fetch_step("abstract metadata", arxiv_id, recid, download_and_save_abstract)
    else:
        log_to_stderr(f"Abstract JSON already exists at {json_path}, skipping download")
    #
    # 3. Fetch paper BibTeX
    paper_bib = os.path.join(folder, f"{base}_bib.bib")
    if force or not os.path.exists(paper_bib):
        _fetch_step("paper BibTeX", arxiv_id, recid, download_and_save_bibtex)
    else:
        log_to_stderr(f"Paper BibTeX already exists at {paper_bib}, skipping download")
    #
    # 4. Fetch references
    refs_bib = os.path.join(folder, f"{base}_references.bib")
    if force or not os.path.exists(refs_bib):
        _fetch_step("references", arxiv_id, recid, download_and_save_references)
    else:
        log_to_stderr(f"References already exist at {refs_bib}, skipping download")
        # Still refresh the derived lists from the existing .bib file.
        convert_references_to_lists(arxiv_id)
    #
    # 5. Fetch citations
    cites_bib = os.path.join(folder, f"{base}_citations.bib")
    if force or not os.path.exists(cites_bib):
        _fetch_step("citations", arxiv_id, recid, download_and_save_citations)
    else:
        log_to_stderr(f"Citations already exist at {cites_bib}, skipping download")
        # Still refresh the derived lists from the existing .bib file.
        convert_citations_to_lists(arxiv_id)
    #
    log_to_stderr(f"Done processing {arxiv_id}.")

# ============================================================================
# Refresh citations
# ============================================================================

def sync_citations_for_paper(arxiv_id: str, force: bool = False) -> bool:
    """Bring one paper's citation files in line with INSPIRE-HEP.

    The number of entries in the local ``<id>_citations.bib`` is compared
    with the ``refersto`` count reported by INSPIRE.  The ``.bib``,
    ``.md`` and ``.html`` files are re-created when the two differ, when
    the local file is missing, or when *force* is ``True``.

    Args:
        arxiv_id: An arXiv paper identifier.
        force: If ``True``, re-download regardless of count comparison.

    Returns:
        ``True`` if citations were re-downloaded, ``False`` if they were
        already up-to-date.
    """
    base, folder = prepare_paper_paths(arxiv_id)
    local_bib = os.path.join(folder, f"{base}_citations.bib")

    if not os.path.exists(local_bib):
        log_to_stderr(f"  No existing citations file for {arxiv_id}, downloading...")

    # With no pre-resolved ID this always performs (and logs) the lookup.
    recid = resolve_recid(arxiv_id)

    remote_count = count_matching_records(recid, "refersto")
    local_count = count_entries_in_bib_file(local_bib)
    log_to_stderr(f"  INSPIRE has {remote_count} citations, local file has {local_count}")

    # A missing file yields local_count None, which never equals the
    # remote count, so it always takes the download path.
    if force or remote_count != local_count:
        download_and_save_citations(arxiv_id, recid)
        log_to_stderr(f"  Saved to {local_bib}")
        return True

    log_to_stderr("  Up-to-date, skipping")
    return False

def sync_all_paper_citations(force: bool = False) -> tuple[int, int]:
    """Run :func:`sync_citations_for_paper` for every downloaded paper.

    The papers are those reported by :func:`scan_for_downloaded_papers`
    with its default root directory.

    Args:
        force: If ``True``, re-download citations even if counts match.

    Returns:
        An ``(updated, total)`` tuple: how many papers had their
        citations re-downloaded and how many papers were processed.
    """
    paper_ids = scan_for_downloaded_papers()
    total = len(paper_ids)
    log_to_stderr(f"Found {total} downloaded papers")

    refreshed = 0
    for arxiv_id in paper_ids:
        log_to_stderr(f"\n--- {arxiv_id} ---")
        was_updated = sync_citations_for_paper(arxiv_id, force)
        if was_updated:
            refreshed += 1

    log_to_stderr(f"\nDone. Updated citations for {refreshed}/{total} papers.")
    return refreshed, total

# ============================================================================
# Merge all info
# ============================================================================

def extract_date_sort_key(path: str) -> tuple[int, int, int]:
    """Derive a sort key from the directory that contains *path*.

    The name of the immediate parent directory is read as an arXiv ID:
    a new-style name such as ``2604.20797`` is used as is, an old-style
    name such as ``hep-lat_9810026`` is turned back into
    ``hep-lat/9810026``.  The ID is then passed to
    :func:`arxiv_id_sort_key`.

    Args:
        path: Path to a file (typically a ``.json`` record) inside a
            per-paper directory.

    Returns:
        The tuple from :func:`arxiv_id_sort_key`, or ``(0, 0, 0)`` when
        the directory name matches neither naming scheme.
    """
    dir_name = os.path.basename(os.path.dirname(path))

    arxiv_id: str | None = None
    if re.match(r"^\d{4}\.\d+$", dir_name):
        # New-style directory: the name already is the arXiv ID.
        arxiv_id = dir_name
    else:
        # Old-style directory: "<archive>_<number>" stands for "<archive>/<number>".
        old_style = re.match(r"^([a-z\-]+)_(\d+)$", dir_name)
        if old_style:
            arxiv_id = f"{old_style.group(1)}/{old_style.group(2)}"

    if arxiv_id is None:
        return (0, 0, 0)
    return arxiv_id_sort_key(arxiv_id)

def collect_bib_files_by_pattern(
    pattern: str, base_dir: str | None = None
) -> list[str]:
    """Return sorted file paths matching a glob pattern under the base directory.\n
    Args:
        pattern: A glob pattern relative to *base_dir* (e.g.,
            ``"YYYY/*/*_references.bib"``).
        base_dir: Root directory to search.  Defaults to the repository
            root.\n
    Returns:
        A sorted list of matching absolute file paths.
    """
    if base_dir is None:
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    files = glob.glob(os.path.join(base_dir, pattern))
    files.sort()
    return files

def combine_all_abstracts(base_dir: str | None = None) -> None:
    """Build the combined abstract pages from all per-paper JSON records.

    Every ``*.json`` file two levels below a four-digit directory of
    *base_dir* is loaded, ordered by :func:`extract_date_sort_key` in
    reverse (most recent first), and rendered into:

    - ``all_abstracts.md`` — All abstracts separated by horizontal rules.
    - ``all_abstracts.html`` — All abstracts in a single HTML document.

    Args:
        base_dir: Root directory to scan.  Defaults to the directory two
            levels above this file.
    """
    root = (
        base_dir
        if base_dir is not None
        else os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )
    md_target = os.path.join(root, "all_abstracts.md")
    html_target = os.path.join(root, "all_abstracts.html")

    record_pattern = os.path.join(root, "[0-9][0-9][0-9][0-9]", "*", "*.json")
    record_paths = glob.glob(record_pattern)
    record_paths.sort(key=extract_date_sort_key, reverse=True)

    # Report the processing order before any file is read.
    for record_path in record_paths:
        year, month, _ = extract_date_sort_key(record_path)
        log_to_stderr(f"{year:04d}-{month:02d}  {os.path.basename(record_path)}")

    pages: list[tuple[str, str]] = []
    for record_path in record_paths:
        with open(record_path) as handle:
            record = json.load(handle)
        # Records without a control number are rendered with recid 0.
        recid = record.get("metadata", {}).get("control_number", 0)
        meta = extract_record_metadata(record)
        markdown_page = build_markdown_abstract_page(recid, meta)
        html_page = build_html_abstract_page(recid, meta)
        pages.append((markdown_page, html_page))

    with open(md_target, "w") as out:
        out.write("\n\n---\n\n".join(md for md, _html in pages) + "\n")
    with open(html_target, "w") as out:
        combined_html = "\n<hr>\n".join(html for _md, html in pages)
        out.write(wrap_in_html_document(combined_html, title="All Abstracts"))
    log_to_stderr(
        f"Merged {len(record_paths)} abstracts → {md_target}, {html_target}"
    )

def combine_all_paper_bibtex(base_dir: str | None = None) -> None:
    """Merge every per-paper ``*_bib.bib`` file into ``all_papers.bib``.

    The input files are the ``*_bib.bib`` files two levels below a
    four-digit directory of *base_dir*.  Merging and writing are done by
    :func:`merge_bib_files_and_write` with the label ``"paper"``.

    Args:
        base_dir: Root directory to scan.  Defaults to the directory two
            levels above this file.
    """
    root = (
        base_dir
        if base_dir is not None
        else os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )
    per_paper_pattern = os.path.join("[0-9][0-9][0-9][0-9]", "*", "*_bib.bib")
    sources = collect_bib_files_by_pattern(per_paper_pattern, root)
    merge_bib_files_and_write(sources, os.path.join(root, "all_papers.bib"), "paper")

def combine_all_references(base_dir: str | None = None) -> None:
    """Merge every per-paper ``*_references.bib`` file into ``all_references.bib``.

    The input files are the ``*_references.bib`` files two levels below a
    four-digit directory of *base_dir*.  Merging and writing are done by
    :func:`merge_bib_files_and_write` with the label ``"reference"``.

    Args:
        base_dir: Root directory to scan.  Defaults to the directory two
            levels above this file.
    """
    root = (
        base_dir
        if base_dir is not None
        else os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )
    per_paper_pattern = os.path.join(
        "[0-9][0-9][0-9][0-9]", "*", "*_references.bib"
    )
    sources = collect_bib_files_by_pattern(per_paper_pattern, root)
    merge_bib_files_and_write(
        sources, os.path.join(root, "all_references.bib"), "reference"
    )

def combine_all_citations(base_dir: str | None = None) -> None:
    """Merge every per-paper ``*_citations.bib`` file into ``all_citations.bib``.

    The input files are the ``*_citations.bib`` files two levels below a
    four-digit directory of *base_dir*.  Merging and writing are done by
    :func:`merge_bib_files_and_write` with the label ``"citation"``.

    Args:
        base_dir: Root directory to scan.  Defaults to the directory two
            levels above this file.
    """
    root = (
        base_dir
        if base_dir is not None
        else os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )
    per_paper_pattern = os.path.join(
        "[0-9][0-9][0-9][0-9]", "*", "*_citations.bib"
    )
    sources = collect_bib_files_by_pattern(per_paper_pattern, root)
    merge_bib_files_and_write(
        sources, os.path.join(root, "all_citations.bib"), "citation"
    )

def run_all_merge_steps(base_dir: str | None = None) -> None:
    """Run every merge step in order, all on the same base directory.

    The steps are :func:`combine_all_abstracts`,
    :func:`combine_all_paper_bibtex`, :func:`combine_all_references` and
    :func:`combine_all_citations`.

    Args:
        base_dir: Root directory handed to each step.  ``None`` lets each
            step fall back to its own default.
    """
    merge_steps = (
        combine_all_abstracts,
        combine_all_paper_bibtex,
        combine_all_references,
        combine_all_citations,
    )
    for merge_step in merge_steps:
        merge_step(base_dir)

# ============================================================================
# BibTeX utilities (extract IDs from .bib files)
# ============================================================================

def collect_author_bib_files() -> list[str]:
    """List the ``*.bib`` files in ``authors/`` under ``BASE_DIR``, sorted.

    Returns:
        The matching paths in ascending string order, or an empty list
        when the ``authors/`` directory does not exist.
    """
    authors_dir = os.path.join(BASE_DIR, "authors")
    if os.path.isdir(authors_dir):
        bib_paths = glob.glob(os.path.join(authors_dir, "*.bib"))
        bib_paths.sort()
        return bib_paths
    return []

def extract_arxiv_ids_from_files(file_paths: list[str]) -> list[str]:
    """Collect the ``eprint`` values found in a set of .bib files.

    All files are read and their contents handed together to
    :func:`merge_bibtex_strings`.  For each merged entry the ``eprint``
    field is passed through :func:`strip_outer_braces`; entries whose
    result is empty (including entries without the field) are skipped.
    The output keeps the order in which :func:`merge_bibtex_strings`
    returns the entries.

    Args:
        file_paths: List of .bib file paths to read.

    Returns:
        A list of ``eprint`` strings, one per entry that has one.
    """

    def read_text(path: str) -> str:
        with open(path) as handle:
            return handle.read()

    bib_texts = [read_text(path) for path in file_paths]
    merged_entries = merge_bibtex_strings(*bib_texts)
    return [
        eprint
        for entry in merged_entries
        if (eprint := strip_outer_braces(entry.get("eprint", "")))
    ]

def download_all_author_papers(force: bool = False) -> None:
    """Run the complete author-driven download pipeline.

    The work happens in four phases:

    1. :func:`sync_all_author_bibs` downloads or refreshes the BibTeX
       file of every author in ``authors.txt``.
    2. The author BibTeX files are merged and their arXiv IDs extracted.
       The function stops here when no author .bib file exists.
    3. :func:`download_all_paper_data` runs for each arXiv ID.  A paper
       that fails is logged and counted, and the loop moves on.
    4. :func:`run_all_merge_steps` rebuilds the repository-wide summary
       files.

    Args:
        force: Passed on to phases 1 and 3; if ``True``, files are
            re-downloaded even if they exist.
    """
    rule = "=" * 60

    def announce(heading: str) -> None:
        # A heading framed by two rules marks the start of a phase.
        log_to_stderr(rule)
        log_to_stderr(heading)
        log_to_stderr(rule)

    # Phase 1: author BibTeX files.
    announce("Phase 1: Downloading author BibTeX files")
    sync_all_author_bibs(force=force)

    # Phase 2: merge the author files and pull out the arXiv IDs.
    log_to_stderr("")
    announce("Phase 2: Merging author BibTeX entries")
    author_bibs = collect_author_bib_files()
    if not author_bibs:
        log_to_stderr("No author .bib files found, nothing to merge.")
        return

    log_to_stderr(f"Found {len(author_bibs)} author .bib files")
    arxiv_ids = extract_arxiv_ids_from_files(author_bibs)
    paper_total = len(arxiv_ids)
    log_to_stderr(f"Extracted {paper_total} unique arXiv IDs")

    # Phase 3: per-paper downloads; one bad paper must not stop the rest.
    log_to_stderr("")
    announce("Phase 3: Downloading paper data")
    succeeded = 0
    failures = 0
    for position, arxiv_id in enumerate(arxiv_ids, start=1):
        log_to_stderr(f"\n--- [{position}/{paper_total}] {arxiv_id} ---")
        try:
            download_all_paper_data(arxiv_id, force=force)
        except Exception as e:
            log_to_stderr(f"  Failed to download {arxiv_id}: {e}")
            log_to_stderr("  Continuing with remaining papers...")
            failures += 1
        else:
            succeeded += 1

    log_to_stderr("")
    log_to_stderr(rule)
    log_to_stderr(f"Downloaded {succeeded}/{paper_total} papers ({failures} failed).")

    # Phase 4: repository-wide summary files.
    log_to_stderr("")
    announce("Phase 4: Merging all paper info")
    run_all_merge_steps()
    log_to_stderr("")
    log_to_stderr(rule)
    log_to_stderr("All done.")
