Source code for goodvibes.pes_yaml

"""Parser for the new (true YAML) PES format.

Schema:

    pathways:
      Reaction: ["Int-I + TolS + TolSH", "Int-II + TolSH", "Int-III"]

    species:
      Int-I:    {files: "Int-I_*.log"}
      TolS:     {files: ["TolS.log"]}
      Int-II:   {files: "Int-II_*.log"}

    zero:
      Reaction: "Int-I + TolS + TolSH"          # optional; defaults to points[0]

    format:
      units: kcal/mol
      decimals: 1

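`pathways` is a dict of pathway-name → ordered list of point label strings.
`species` is a dict of species-name → {files: <glob string OR list>} (the
dict shape leaves room for per-species options later — scaling factors,
symmetry numbers — without breaking the schema). A species entry may also
select by directory with `dir: <name>` / `dirs: [<name>, ...]` (optionally
combined with `files:`), or be written in shorthand as a bare glob string or
a bare list of file patterns. `zero` is optional and applies per pathway.
`format` parses into `PESOptions`; keys other than `units` and
`decimals`/`dec` are kept as string-valued extras.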
"""
from __future__ import annotations

from typing import Dict, List, Tuple, Union

from .pes_legacy import _replace
from .pes_loader import PESSpec
from .pes_model import PESOptions


def parse_yaml(text: str) -> PESSpec:
    """Parse the YAML text into a `PESSpec`. PyYAML is required.

    The document root must be a mapping with top-level `pathways:` and
    `species:` blocks; `zero:` and `format:` are optional and default to
    empty mappings.

    Args:
        text: The YAML document as a string.

    Returns:
        A `PESSpec` built from the parsed pathways, species, zero points,
        format options, and unrecognized format keys (`format_extras`).

    Raises:
        RuntimeError: If PyYAML cannot be imported. The original
            `ImportError` is attached as `__cause__`.
        ValueError: If the root is not a mapping, a required block is
            missing, or any block fails validation in its helper.
    """
    # Imported lazily so PyYAML is only needed when this parser is used.
    try:
        import yaml
    except ImportError as exc:
        raise RuntimeError(
            "PyYAML is required to read modern PES YAML files. Install with "
            "`pip install pyyaml`, or use the legacy `--- # PES` format."
        ) from exc

    # safe_load builds only plain Python containers and scalars.
    data = yaml.safe_load(text)
    if not isinstance(data, dict):
        raise ValueError("PES YAML root must be a mapping")
    if "pathways" not in data:
        raise ValueError("PES YAML must define a top-level `pathways:` block")
    if "species" not in data:
        raise ValueError("PES YAML must define a top-level `species:` block")

    pathways = _parse_pathways(data["pathways"])
    species = _parse_species(data["species"])
    # `zero` is validated against the parsed pathway names.
    zero = _parse_zero(data.get("zero", {}), pathways)
    options, extras = _parse_format(data.get("format", {}))

    return PESSpec(
        pathways=pathways,
        species=species,
        zero=zero,
        options=options,
        format_extras=extras,
    )
def _parse_pathways(raw) -> Dict[str, List[str]]:
    """Validate the `pathways:` block.

    Args:
        raw: The value found under `pathways:` in the YAML document.

    Returns:
        Mapping of pathway name (coerced to `str`) to its ordered list of
        point labels. Labels are stringified and stripped; labels that are
        blank after stripping are dropped.

    Raises:
        ValueError: If `raw` is not a mapping or is empty, or if a pathway's
            points are not a list or contain no non-blank labels.
    """
    if not isinstance(raw, dict):
        raise ValueError(f"`pathways` must be a mapping, got {type(raw).__name__}")
    pathways: Dict[str, List[str]] = {}
    for name, points in raw.items():
        if not isinstance(points, list):
            raise ValueError(
                f"pathway {name!r}: points must be a list of strings, "
                f"got {type(points).__name__}"
            )
        stripped = (str(p).strip() for p in points)
        cleaned = [label for label in stripped if label]
        if not cleaned:
            raise ValueError(f"pathway {name!r} has no points")
        pathways[str(name)] = cleaned
    if not pathways:
        raise ValueError("`pathways` is empty")
    return pathways


def _parse_species(raw) -> Dict[str, Union[str, List[str]]]:
    """Validate the `species:` block.

    Args:
        raw: The value found under `species:` in the YAML document.

    Returns:
        Mapping of species name (coerced to `str`) to the pattern or list of
        patterns produced by `_extract_files` for that entry.

    Raises:
        ValueError: If `raw` is not a mapping or is empty, or if any entry is
            rejected by `_extract_files`.
    """
    if not isinstance(raw, dict):
        raise ValueError(f"`species` must be a mapping, got {type(raw).__name__}")
    species: Dict[str, Union[str, List[str]]] = {
        str(name): _extract_files(name, entry) for name, entry in raw.items()
    }
    if not species:
        raise ValueError("`species` is empty")
    return species


#: Internal prefix used to encode a "match this directory" pattern in the
#: same `Union[str, List[str]]` value space as file globs. Users never see
#: it — they write `{dir: "X"}` in YAML and the loader translates.
_DIR_PREFIX = "@dir:"


def _extract_files(name: str, entry) -> Union[str, List[str]]:
    """Translate one species entry into its pattern(s).

    A species entry can be:

        {files: "glob"}                     # match by filename stem (default)
        {files: ["a.log", "b.log", ...]}
        {dir: "subdir"}                     # match files whose parent dir basename matches
        {dirs: ["sub_a", "sub_b", ...]}     # multiple directories
        {files: [...], dir: "X"}            # combined: union of both rules
        "glob"                              # shorthand: bare string (file pattern)
        ["a.log", ...]                      # shorthand: bare list (file patterns)

    Directory entries can themselves be fnmatch globs (e.g. `dir: "TS_*"`).
    Trailing slashes and trailing "/*" or "/**" segments on a directory name
    are dropped automatically so users can write either `dir: "X"` or
    `dir: "X/*"`.

    Args:
        name: Species name; used only in error messages.
        entry: The raw YAML value for this species.

    Returns:
        A single pattern string when the entry yields exactly one pattern,
        otherwise a list of patterns. Directory patterns carry `_DIR_PREFIX`.
        A bare string is returned unchanged and a bare list is returned with
        each item stringified.

    Raises:
        ValueError: If the entry is not a string, list, or mapping; if a
            mapping has none of `files:`, `dir:`, `dirs:`; or if any of those
            keys has a value of the wrong type.
    """
    if isinstance(entry, str):
        return entry
    if isinstance(entry, list):
        return [str(x) for x in entry]
    if not isinstance(entry, dict):
        raise ValueError(
            f"species {name!r}: entry must be a string, list, or mapping, "
            f"got {type(entry).__name__}"
        )

    if "files" not in entry and "dir" not in entry and "dirs" not in entry:
        raise ValueError(
            f"species {name!r}: needs at least one of `files:`, `dir:`, "
            "or `dirs:`"
        )

    out: List[str] = []
    if "files" in entry:
        files = entry["files"]
        if isinstance(files, str):
            out.append(files)
        elif isinstance(files, list):
            out.extend(str(x) for x in files)
        else:
            raise ValueError(
                f"species {name!r}: `files:` must be a string or list, "
                f"got {type(files).__name__}"
            )
    if "dir" in entry:
        d = entry["dir"]
        if not isinstance(d, str):
            raise ValueError(
                f"species {name!r}: `dir:` must be a string, "
                f"got {type(d).__name__}"
            )
        out.append(_DIR_PREFIX + _normalize_dir(d))
    if "dirs" in entry:
        dirs = entry["dirs"]
        if not isinstance(dirs, list):
            raise ValueError(
                f"species {name!r}: `dirs:` must be a list of strings, "
                f"got {type(dirs).__name__}"
            )
        for d in dirs:
            if not isinstance(d, str):
                raise ValueError(
                    f"species {name!r}: `dirs:` entries must be strings, "
                    f"got {type(d).__name__}"
                )
            out.append(_DIR_PREFIX + _normalize_dir(d))
    # Single string vs list — preserve old single-string shape for
    # back-compat with external code that introspects PESSpec.species.
    return out[0] if len(out) == 1 else out


def _normalize_dir(d: str) -> str:
    """Strip trailing slashes / globs from a directory pattern.

    `dir: "X/"`, `dir: "X/*"` and `dir: "X/**"` all normalize to `"X"`.
    Stripping repeats until nothing changes, so compound tails such as
    `"X/**/*"` or `"X//*"` also reduce to `"X"`.

    Args:
        d: Directory pattern as written by the user.

    Returns:
        The pattern with surrounding whitespace, trailing slashes, and
        trailing "/*" / "/**" segments removed.
    """
    d = d.strip()
    while True:
        trimmed = d.rstrip("/")
        for tail in ("/**", "/*"):
            if trimmed.endswith(tail):
                trimmed = trimmed[: -len(tail)]
                break
        if trimmed == d:
            return d
        d = trimmed


def _parse_zero(raw, pathways: Dict[str, List[str]]) -> Dict[str, str]:
    """Validate the optional `zero:` block.

    Args:
        raw: The value found under `zero:`; any falsy value means "absent".
        pathways: Parsed pathways, keyed by string pathway name.

    Returns:
        Mapping of pathway name to the stripped, stringified zero label.
        Empty when `raw` is falsy.

    Raises:
        ValueError: If `raw` is not a mapping or names a pathway that is not
            in `pathways`.
    """
    if not raw:
        return {}
    if not isinstance(raw, dict):
        raise ValueError(f"`zero` must be a mapping, got {type(raw).__name__}")
    zeros: Dict[str, str] = {}
    for path_name, label in raw.items():
        # Pathway names were coerced to str when parsed, so compare the
        # coerced key; otherwise a non-string YAML key (e.g. `1:`) would be
        # reported as unknown even though the pathway exists.
        key = str(path_name)
        if key not in pathways:
            raise ValueError(
                f"`zero.{path_name}` references unknown pathway "
                f"(known: {sorted(pathways)})"
            )
        zeros[key] = str(label).strip()
    return zeros


def _parse_format(raw) -> Tuple[PESOptions, Dict[str, str]]:
    """Validate the optional `format:` block.

    Keys are matched case-insensitively. `units` must be exactly
    'kcal/mol' or 'kJ/mol'; `decimals` (alias `dec`) is converted with
    `int()`. Every other key is kept, stringified, in the extras dict.

    Args:
        raw: The value found under `format:`; any falsy value means "absent".

    Returns:
        `(options, extras)`: the resulting `PESOptions` and a dict of
        unrecognized keys mapped to their stringified values.

    Raises:
        ValueError: If `raw` is not a mapping, `units` is not an accepted
            value, or `decimals` cannot be converted to an integer.
    """
    if not raw:
        return PESOptions(), {}
    if not isinstance(raw, dict):
        raise ValueError(f"`format` must be a mapping, got {type(raw).__name__}")
    options = PESOptions()
    extras: Dict[str, str] = {}
    for key, value in raw.items():
        key_lower = str(key).lower()
        if key_lower == "units":
            if value not in ("kcal/mol", "kJ/mol"):
                raise ValueError(
                    f"format.units: expected 'kcal/mol' or 'kJ/mol', got {value!r}"
                )
            # NOTE(review): `_replace` comes from pes_legacy; presumably it
            # returns a copy of `options` with the given field overridden.
            options = _replace(options, units=value)
        elif key_lower in ("decimals", "dec"):
            try:
                options = _replace(options, decimals=int(value))
            except (ValueError, TypeError) as exc:
                raise ValueError(
                    f"format.decimals: expected integer, got {value!r}"
                ) from exc
        else:
            extras[str(key)] = str(value)
    return options, extras