Source code for goodvibes.pes_loader

"""Format-agnostic PES loading pipeline.

Three stages, each independently testable:

    text  ──parse_legacy()/parse_yaml()──▶  PESSpec
    PESSpec + thermo_data ──build_pes_result()──▶  PESResult

`PESSpec` is the intermediate that both parsers produce: a description of
pathways, species (as file patterns), zero overrides, and options — but
*no* references to actual `calc_bbe` instances. The builder resolves
patterns against `thermo_data`, constructs `ConformerSet`s, and emits a
fully-realized `PESResult`.

The format dispatcher `load_pes()` sniffs the file content (presence of
`--- # PES`/`# SPECIES`/`# FORMAT` markers) to choose the parser; the
legacy path emits a `DeprecationWarning`.
"""
from __future__ import annotations

import fnmatch
import os
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Union

from .pes_model import (
    ConformerSet, PESOptions, PESResult, Pathway, Point, parse_point_label,
)


# ---------------------------------------------------------------------------
# Intermediate: PESSpec
# ---------------------------------------------------------------------------

@dataclass
class PESSpec:
    """Parser-independent description of a PES, prior to file resolution.

    Both the legacy parser and the YAML parser produce this structure.

    Attributes:
        pathways: Maps each pathway name to its point labels, in order.
        species: Maps each species name to one file pattern (glob or
            literal) or to a list of such patterns.
        zero: Maps a pathway name to the point label used as its zero.
            Optional; a pathway absent from this mapping defaults to its
            first point.
        options: The `PESOptions` parsed from the FORMAT block.
        format_extras: FORMAT keys that `PESOptions` did not consume, kept
            verbatim so `graph_reaction_profile` can read them.
    """

    pathways: Dict[str, List[str]]
    species: Dict[str, Union[str, List[str]]]
    zero: Dict[str, str] = field(default_factory=dict)
    options: PESOptions = field(default_factory=PESOptions)
    format_extras: Dict[str, str] = field(default_factory=dict)


# ---------------------------------------------------------------------------
# Format detection
# ---------------------------------------------------------------------------

# Section names that identify the legacy format when they appear as the
# comment text of a `--- # NAME` separator line.
_LEGACY_MARKERS = {"PES", "SPECIES", "FORMAT"}


def is_legacy_format(text: str) -> bool:
    """Report whether `text` contains a legacy section marker.

    A marker is a line which, after surrounding whitespace is stripped,
    starts with `---` and contains a `#` whose trailing text (stripped,
    compared case-insensitively) is exactly one of `PES`, `SPECIES` or
    `FORMAT`. A single such line anywhere in the text is enough.

    Args:
        text: Full contents of the PES definition file.

    Returns:
        True if at least one marker line is found, False otherwise (the
        caller then treats the file as true YAML).
    """
    for candidate in map(str.strip, text.splitlines()):
        if not candidate.startswith("---"):
            continue
        # Everything after the first '#' is the comment text to test.
        _, hash_mark, trailer = candidate.partition("#")
        if hash_mark and trailer.strip().upper() in _LEGACY_MARKERS:
            return True
    return False


# --------------------------------------------------------------------------- # Pattern resolution # --------------------------------------------------------------------------- def _stem(path: str) -> str: """Basename without final extension. 'foo/bar.log' -> 'bar'.""" return os.path.splitext(os.path.basename(path))[0] def _parent_dir(path: str) -> str: """Basename of the parent directory. 'A/B/c.log' -> 'B'; 'c.log' -> ''.""" parent = os.path.dirname(path) return os.path.basename(parent) if parent else "" def _strip_ext(pattern: str) -> str: """Drop a trailing .log/.out so patterns can be written either way.""" for ext in (".log", ".out"): if pattern.endswith(ext): return pattern[: -len(ext)] return pattern #: Internal sentinel echoed from pes_yaml._DIR_PREFIX. Kept duplicated #: rather than imported to avoid a circular import at module load. _DIR_PREFIX = "@dir:"
[docs] def resolve_pattern(pattern: str, thermo_data: dict) -> List[str]: """Resolve a species pattern against the keys of `thermo_data`. Two pattern flavors: * **File patterns** (default) — matched against the basename of each key with the file extension stripped. Supports fnmatch globs; a pattern with no glob characters must equal the stem exactly. * **Directory patterns** — strings prefixed with `@dir:` (set automatically by the YAML loader when the user writes `{dir: "X"}` / `{dirs: [...]}`). Matched against the basename of each key's *parent directory*. Supports fnmatch globs. Returns the matched thermo_data keys in their original (insertion) order. """ if pattern.startswith(_DIR_PREFIX): dir_pat = pattern[len(_DIR_PREFIX):].strip() has_glob = any(c in dir_pat for c in "*?[") matches: List[str] = [] for key in thermo_data: parent = _parent_dir(key) if has_glob: if fnmatch.fnmatch(parent, dir_pat): matches.append(key) elif parent == dir_pat: matches.append(key) return matches pat = _strip_ext(pattern.strip()) has_glob = any(c in pat for c in "*?[") matches = [] for key in thermo_data: stem = _stem(key) if has_glob: if fnmatch.fnmatch(stem, pat): matches.append(key) elif stem == pat: matches.append(key) return matches
def _resolve_species(
    name: str,
    pattern: Union[str, List[str]],
    thermo_data: dict,
) -> ConformerSet:
    """Resolve one species' pattern(s) into a `ConformerSet`.

    Args:
        name: Species name, used for the result and in the error message.
        pattern: A single pattern string or a list of pattern strings.
        thermo_data: Mapping from file path to the per-file thermo object.

    Returns:
        A `ConformerSet` holding the matched files and the corresponding
        `thermo_data` values, in matching order.

    Raises:
        ValueError: If no key of `thermo_data` matches any pattern.
    """
    if isinstance(pattern, str):
        patterns = [pattern]
    else:
        patterns = list(pattern)

    # Overlapping globs may select the same file more than once;
    # dict.fromkeys keeps only the first occurrence and preserves order.
    hits = list(dict.fromkeys(
        key for p in patterns for key in resolve_pattern(p, thermo_data)
    ))
    if not hits:
        raise ValueError(
            f"species {name!r}: no files matched pattern(s) {patterns!r}. "
            f"available stems: {sorted({_stem(k) for k in thermo_data})}"
        )

    return ConformerSet(
        name=name,
        files=hits,
        bbes=[thermo_data[key] for key in hits],
    )


# ---------------------------------------------------------------------------
# Builder: PESSpec + thermo_data -> PESResult
# ---------------------------------------------------------------------------

def build_pes_result(
    spec: PESSpec,
    thermo_data: dict,
    temperatures: Optional[List[float]] = None,
) -> PESResult:
    """Resolve patterns, build ConformerSets, parse point labels, return PESResult.

    Every species named by a pathway point or by a zero label is resolved
    against `thermo_data`. Species are resolved in the order they are first
    mentioned (pathway points first, then zero labels), so the first error
    reported for a given input is the same on every run.

    Args:
        spec: Parsed PES description.
        thermo_data: Mapping from file path to the per-file thermo object.
        temperatures: Temperatures stored on the result; when None or empty,
            `[298.15]` is used.

    Returns:
        A `PESResult` with one `Pathway` per entry of `spec.pathways`.

    Raises:
        KeyError: A referenced species is not defined in `spec.species`.
        ValueError: A defined species matches no file in `thermo_data`, or
            a pathway has no points and no explicit zero to fall back on.
    """
    # 1. Collect every species mentioned anywhere. A dict (rather than a
    #    set) keeps first-seen order, which makes resolution deterministic.
    labels: List[str] = [
        label for points in spec.pathways.values() for label in points
    ]
    labels.extend(spec.zero.values())

    needed: Dict[str, None] = {}
    for label in labels:
        for _, name in parse_point_label(label):
            needed.setdefault(name, None)

    species_map: Dict[str, ConformerSet] = {}
    for name in needed:
        if name not in spec.species:
            raise KeyError(
                f"species {name!r} referenced by a pathway but not defined "
                f"in the SPECIES block (defined: {sorted(spec.species)})"
            )
        species_map[name] = _resolve_species(name, spec.species[name], thermo_data)

    # 2. Build pathways. For each pathway, parse every point, resolve zero.
    pathways: List[Pathway] = []
    for path_name, point_labels in spec.pathways.items():
        points = [Point.from_label(label, species_map) for label in point_labels]
        if path_name in spec.zero:
            zero_point = Point.from_label(spec.zero[path_name], species_map)
        elif points:
            # No override: the first point of the pathway is the reference.
            zero_point = points[0]
        else:
            # Previously this surfaced as a bare IndexError from points[0].
            raise ValueError(
                f"pathway {path_name!r} has no points and no explicit zero; "
                "cannot choose a reference point"
            )
        pathways.append(Pathway(name=path_name, points=points, zero=zero_point))

    return PESResult(
        pathways=pathways,
        options=spec.options,
        temperatures=temperatures or [298.15],
    )


# --------------------------------------------------------------------------- # Top-level dispatcher # ---------------------------------------------------------------------------
def load_pes(
    path: Union[str, os.PathLike],
    thermo_data: dict,
    temperatures: Optional[List[float]] = None,
) -> PESResult:
    """Read a PES definition file, parse it, and return a `PESResult`.

    The file content is sniffed with `is_legacy_format`: files carrying
    legacy `--- # PES`-style markers go through `parse_legacy` (and emit a
    `DeprecationWarning`); everything else goes through `parse_yaml`. The
    resulting spec is handed to `build_pes_result`.

    Args:
        path: Location of the PES definition file.
        thermo_data: Mapping from file path to the per-file thermo object.
        temperatures: Passed through to `build_pes_result`.

    Returns:
        The `PESResult` produced by `build_pes_result`.

    Raises:
        OSError: The file cannot be read.
        UnicodeDecodeError: The file is not valid UTF-8.
    """
    # Decode explicitly so parsing does not depend on the platform locale.
    # "utf-8-sig" also drops a leading BOM; str.strip() does not remove it,
    # so a BOM would otherwise hide a first-line `--- # PES` marker from
    # is_legacy_format().
    text = Path(path).read_text(encoding="utf-8-sig")

    if is_legacy_format(text):
        warnings.warn(
            f"PES file {path!r} uses the legacy line-based format. "
            "This format is deprecated and will be removed in v5.1; "
            "see ROADMAP.md Sub-plan B for the new YAML schema.",
            DeprecationWarning,
            stacklevel=2,
        )
        # Imported lazily, as in the original, so the parser modules are
        # only loaded when actually needed.
        from .pes_legacy import parse_legacy
        spec = parse_legacy(text)
    else:
        from .pes_yaml import parse_yaml
        spec = parse_yaml(text)

    return build_pes_result(spec, thermo_data, temperatures=temperatures)