# Source code for goodvibes.pes_legacy

"""Parser for the legacy line-based PES format.

The format predates v4.2; despite the `.yaml` extension it isn't valid YAML
(parens in species names like `Cu(III)-S` and the `[a, b, c]` PES point
lists with bare identifiers would either fail or parse strangely).

Sections are introduced by `--- # NAME` comment markers:

    --- # PES
        Reaction: [Int-I + TolS, Int-II, Int-III]
    --- # SPECIES
        Int-I: Int-I_*
    --- # FORMAT
        units: kcal/mol
        dec: 1
        zero: Int-I + TolS

The parser produces a `PESSpec`; format keys it doesn't consume (graph
styling — dpi, color, title, etc.) are preserved in `format_extras` so
`graph_reaction_profile` can keep reading them.
"""
from __future__ import annotations

from typing import Dict, List, Optional, Tuple

from .pes_model import PESOptions
from .pes_loader import PESSpec


# Section terminators: a `---` line OR end of file.
def _split_sections(text: str) -> Dict[str, List[str]]:
    """Group the lines of a legacy PES file by section.

    A line whose stripped text starts with `---` closes the current section.
    If that line also carries a `# NAME` marker naming PES, SPECIES or FORMAT
    (case-insensitive), a section of that name is opened; any other marker,
    or no marker at all, leaves the parser outside every section.

    Lines outside a section and lines whose stripped text starts with `#`
    are dropped. Every other line, blank lines included, is stored exactly
    as it appeared in the input (indentation intact). A section that occurs
    more than once accumulates the lines of every occurrence.

    Returns a dict keyed by upper-case section name; only sections that
    actually appear in the input are present.
    """
    known = ("PES", "SPECIES", "FORMAT")
    result: Dict[str, List[str]] = {}
    active: Optional[str] = None
    for line in text.splitlines():
        body = line.strip()
        if body[:3] == "---":
            # Everything after the first '#' is the candidate section name.
            _, found, tail = body.partition("#")
            label = tail.strip().upper() if found else ""
            if label in known:
                active = label
                if active not in result:
                    result[active] = []
            else:
                active = None
            continue
        if active is not None and not body.startswith("#"):
            result[active].append(line)
    return result


def _split_kv(line: str) -> Optional[Tuple[str, str]]:
    """Break a `key: value` or `key = value` line into a (key, value) pair.

    The separator is whichever of `:` and `=` occurs first in the stripped
    line, so the value may itself contain either character (e.g. a colour
    or a title). Both parts are returned with surrounding whitespace
    removed; no check is made that either part is non-empty.

    Returns None for a blank line or a line containing neither separator.
    """
    text = line.strip()
    # str.find() yields -1 for a missing separator; keep only real positions.
    hits = [pos for pos in (text.find(":"), text.find("=")) if pos != -1]
    if not hits:
        return None
    cut = min(hits)
    return text[:cut].strip(), text[cut + 1:].strip()


def _parse_pathways(lines: List[str]) -> Dict[str, List[str]]:
    """Turn the PES section lines into {pathway_name: [point_label, ...]}.

    Every line that `_split_kv` recognises must look like
    `name: [p1, p2, ...]`; lines it rejects (blank, no separator) are
    skipped. Point labels are separated by commas only, so a label such as
    "A + B" stays in one piece. Empty entries between commas are discarded.
    If a pathway name is repeated, the last occurrence wins.

    Raises ValueError when a value is not wrapped in square brackets, when a
    pathway ends up with no points, or when no pathway was found at all.
    """
    result: Dict[str, List[str]] = {}
    for entry in lines:
        pair = _split_kv(entry)
        if pair is None:
            continue
        label, raw_list = pair
        bracketed = raw_list.startswith("[") and raw_list.endswith("]")
        if not bracketed:
            raise ValueError(
                f"PES section: pathway {label!r} value must be a [bracketed list], "
                f"got {raw_list!r}"
            )
        # Strip each comma-separated piece and drop the ones left empty.
        stops = list(filter(None, map(str.strip, raw_list[1:-1].split(","))))
        if len(stops) == 0:
            raise ValueError(f"PES section: pathway {label!r} has no points")
        result[label] = stops
    if len(result) == 0:
        raise ValueError("PES section is empty or missing")
    return result


def _parse_species(lines: List[str]) -> Dict[str, str]:
    """Turn the SPECIES section lines into {species_name: pattern}.

    Lines that `_split_kv` rejects are skipped, as is any entry whose key is
    `folder` (compared case-insensitively), whatever its value. If a species
    name is repeated, the last occurrence wins.

    Raises ValueError when a species has an empty pattern or when the
    section yields no species at all.
    """
    result: Dict[str, str] = {}
    for entry in lines:
        pair = _split_kv(entry)
        if pair is None or pair[0].lower() == "folder":
            continue
        label, pattern = pair
        if pattern == "":
            raise ValueError(f"SPECIES section: empty pattern for {label!r}")
        result[label] = pattern
    if len(result) == 0:
        raise ValueError("SPECIES section is empty or missing")
    return result


def _parse_format(lines: List[str]) -> Tuple[PESOptions, Dict[str, str], Dict[str, str]]:
    """Parse the FORMAT section. Returns (PESOptions, zero overrides, extras).

    Keys are matched case-insensitively:
        units    -> options.units; must be exactly 'kcal/mol' or 'kJ/mol'
        dec      -> options.decimals; must parse as an integer
        zero     -> stored under the single key "__default__" in the zero
                    overrides dict; `parse_legacy` expands it to one entry
                    per pathway
    Any other key is kept in `extras` with its original spelling and its
    value as a string, for downstream consumers. When a key is repeated the
    last occurrence wins. Lines that `_split_kv` rejects are skipped.

    Raises:
        ValueError: if `units` has an unsupported value or `dec` is not an
            integer. For `dec`, the underlying conversion error is attached
            as the exception's cause.
    """
    options = PESOptions()
    zero_default: Optional[str] = None
    extras: Dict[str, str] = {}
    for line in lines:
        kv = _split_kv(line)
        if kv is None:
            continue
        key, value = kv
        key_lower = key.lower()
        if key_lower == "units":
            if value not in ("kcal/mol", "kJ/mol"):
                raise ValueError(
                    f"FORMAT.units: expected 'kcal/mol' or 'kJ/mol', got {value!r}"
                )
            options = _replace(options, units=value)
        elif key_lower == "dec":
            # Guard only the conversion: an error raised while rebuilding the
            # options must not be reported as a malformed integer.
            try:
                decimals = int(value)
            except ValueError as err:
                raise ValueError(
                    f"FORMAT.dec: expected integer, got {value!r}"
                ) from err
            options = _replace(options, decimals=decimals)
        elif key_lower == "zero":
            zero_default = value
        else:
            extras[key] = value
    zeros: Dict[str, str] = {}
    if zero_default is not None:
        # Sentinel key; the caller fans it out over all pathway names.
        zeros["__default__"] = zero_default
    return options, zeros, extras


def _replace(options: PESOptions, **kwargs) -> PESOptions:
    """Build a new `PESOptions` from `options` with selected fields overridden.

    Only `units`, `decimals`, `gconf`, `QH`, `spc_used` and `lowest_only`
    are considered: each is taken from `kwargs` when supplied, otherwise
    copied from `options`. Any other keyword argument is ignored.
    """
    field_names = ("units", "decimals", "gconf", "QH", "spc_used", "lowest_only")
    merged = {}
    for field in field_names:
        # The current value is always read, then overridden if requested.
        merged[field] = kwargs.get(field, getattr(options, field))
    return PESOptions(**merged)


def parse_legacy(text: str) -> PESSpec:
    """Parse the legacy line-based PES format into a `PESSpec`.

    The text is split into its `--- # NAME` sections. PES and SPECIES are
    mandatory; FORMAT is optional. When FORMAT supplies a `zero` entry, the
    same reference label is assigned to every pathway found in the PES
    section; otherwise the `zero` mapping is empty.

    Args:
        text: Full contents of a legacy PES file.

    Returns:
        A `PESSpec` built from the parsed pathways, species, per-pathway
        zero references, options, and the unconsumed FORMAT keys
        (`format_extras`).

    Raises:
        ValueError: if the PES or SPECIES section is missing, or if any
            section parser rejects its contents.
    """
    sections = _split_sections(text)
    if "PES" not in sections:
        raise ValueError("legacy PES file is missing the `--- # PES` section")
    if "SPECIES" not in sections:
        raise ValueError("legacy PES file is missing the `--- # SPECIES` section")

    pathways = _parse_pathways(sections["PES"])
    species = _parse_species(sections["SPECIES"])

    # Defaults used when the file has no FORMAT section.
    options = PESOptions()
    zero_default: Dict[str, str] = {}
    extras: Dict[str, str] = {}
    if "FORMAT" in sections:
        options, raw_zeros, extras = _parse_format(sections["FORMAT"])
        if "__default__" in raw_zeros:
            # The legacy format has a single zero; apply it to each pathway.
            zero_default = {p: raw_zeros["__default__"] for p in pathways}

    return PESSpec(
        pathways=pathways,
        species=species,
        zero=zero_default,
        options=options,
        format_extras=extras,
    )