Source code for goodvibes.selectivity

"""Boltzmann weighting and selectivity calculations for GoodVibes."""
import fnmatch
import logging
import math
import os.path
import sys
import warnings
from dataclasses import dataclass
from glob import glob
from typing import Dict, List, Optional

from .constants import GAS_CONSTANT, J_TO_AU, KCAL_TO_AU
from .sort import SORT_KEYS

log = logging.getLogger('goodvibes')


# ---------------------------------------------------------------------------
# Structured selectivity result (v4.2+)
# ---------------------------------------------------------------------------


[docs]
@dataclass(frozen=True)
class SelectivityResult:
    """N-species selectivity outcome at one temperature.

    Numeric data only — formatting strings (e.g. "60:40", "1.5:1") are
    derived in the print layer from `populations`. Pairwise data are not
    stored here: for N=2, ee + ddG suffice; for N>2, downstream consumers
    derive any ratios they need from `populations`.
    """
    temperature: float                          # Kelvin
    key: str                                    # 'gibbs' | 'energy'
    labels: List[str]                           # ordered species names
    files_per_label: Dict[str, List[str]]       # species -> file paths
    populations: Dict[str, float]               # normalized: Σ = 1.0
    raw_boltzmann: Dict[str, float]             # un-normalized e^(-ΔG/RT)
    preferred: str                              # max-population label
    ee: Optional[float] = None                  # 2-label only: (a-b)*100, in %
    ddG: Optional[float] = None                 # 2-label only: ΔΔG‡ in Hartree



# ---------------------------------------------------------------------------
# Label-spec parsing
# ---------------------------------------------------------------------------


[docs]
def parse_label_args(label_args):
    """Parse repeatable --label NAME=PATTERN args into an ordered dict.

    Parameters:
        label_args (list[str] or None): values from argparse for --label.

    Returns:
        dict[str, str]: ordered mapping label -> fnmatch glob pattern.
    """
    if not label_args:
        return {}
    spec = {}
    for arg in label_args:
        if '=' not in arg:
            raise ValueError(
                f"Invalid --label {arg!r}: expected NAME=PATTERN "
                "(e.g. --label R='*P_R_*')."
            )
        name, _, pattern = arg.partition('=')
        name = name.strip()
        pattern = pattern.strip()
        if not name or not pattern:
            raise ValueError(
                f"Invalid --label {arg!r}: NAME and PATTERN must both be "
                "non-empty."
            )
        if name in spec:
            raise ValueError(f"Duplicate --label name {name!r}.")
        spec[name] = pattern
    return spec




[docs]
def load_label_yaml(path):
    """Load a selectivity YAML file.

    Supports two shapes (mutually exclusive):
        labels: { R: '*P_R_*', S: '*P_S_*' }            # fnmatch patterns
        files:  { R: [a.log, b.log], S: [c.log, ...] }  # explicit file lists

    Returns:
        (mode, dict): mode is 'patterns' or 'files'; dict maps label to its
        spec (string pattern, or list of file paths).
    """
    try:
        import yaml
    except ImportError:
        raise RuntimeError(
            "PyYAML is required for --selectivity. Install with "
            "`pip install pyyaml` or use --label instead."
        )
    with open(path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or {}
    if 'labels' in data and 'files' in data:
        raise ValueError(
            f"{path}: provide either 'labels' (patterns) OR 'files' "
            "(explicit lists), not both."
        )
    if 'labels' in data:
        return 'patterns', dict(data['labels'])
    if 'files' in data:
        return 'files', {k: list(v) for k, v in data['files'].items()}
    raise ValueError(
        f"{path}: top-level key 'labels' or 'files' is required."
    )




[docs]
def assign_files_to_labels(files, label_patterns):
    """Group file paths by fnmatch against each label's pattern.

    Each pattern is tested against TWO candidates per file, in order:

    1. the file's basename (e.g. ``DA_exo_12_i.out``) — matches when
       species are encoded in the filename, the v4.x default.
    2. the basename of the file's immediate parent directory
       (e.g. ``exo``) — matches when species are organized into
       per-species subdirectories.

    A file is assigned to the **first label** whose pattern matches
    either candidate. So a layout like ``exo/DA_*.out`` ``endo/DA_*.out``
    works with ``--label exo=exo --label endo=endo`` (parent-dir
    match), and the original ``--label exo='*_exo_*'`` (basename
    match) keeps working unchanged.

    Parameters:
        files (Iterable[str]): file paths to species.
        label_patterns (dict): ordered label -> fnmatch glob pattern.

    Returns:
        dict[str, list[str]]: label -> list of matching file paths.
    """
    speciess = {label: [] for label in label_patterns}
    for file in files:
        base = os.path.basename(file)
        parent = os.path.basename(os.path.dirname(file))
        for label, pattern in label_patterns.items():
            if (fnmatch.fnmatch(base, pattern)
                    or (parent and fnmatch.fnmatch(parent, pattern))):
                speciess[label].append(file)
                break
    return speciess



# ---------------------------------------------------------------------------
# Core compute
# ---------------------------------------------------------------------------

def _excluded_files(dup_list):
    """Set of file paths to exclude from sums based on a dup_list.

    Convention from `sort.deduplicate`: each pair is [duplicate, canonical].
    The first element is the redundant copy that should be dropped; the
    second is the kept structure. Excluding only the first member matches
    the existing behavior of `get_boltz` and `get_selectivity` and avoids
    over-counting losses.
    """
    excluded = set()
    if not dup_list:
        return excluded
    for pair in dup_list:
        if pair:
            excluded.add(pair[0])
    return excluded



[docs]
def compute_selectivity(thermo_data, files_per_label, temperature,
                        dup_list=None, key='gibbs'):
    """Compute populations and (for N=2) ee + ΔΔG‡ for a labeled species set.

    Parameters:
        thermo_data (dict): file path -> calc_bbe.
        files_per_label (dict): ordered label -> list of file paths.
        temperature (float): Kelvin.
        dup_list (list, optional): pairs of duplicate files to exclude
            from sums.
        key (str): 'gibbs' (qh_gibbs_free_energy) or 'energy' (scf_energy).

    Returns:
        SelectivityResult.

    Raises:
        ValueError: if any label has no files, or if no files have a usable
            energy attribute.
    """
    attr = SORT_KEYS[key]
    excluded = _excluded_files(dup_list)
    labels = list(files_per_label.keys())
    if len(labels) < 2:
        raise ValueError("Selectivity needs at least two labels.")

    empty = [label for label in labels if not files_per_label[label]]
    if empty:
        raise ValueError(
            f"No files matched the following label(s): {', '.join(empty)}. "
            "Check the patterns or file lists in your selectivity spec."
        )

    # Find the global minimum energy (across all labeled files we'll keep) so
    # we can shift before exponentiating to avoid float overflow / underflow.
    e_min = math.inf
    for label in labels:
        for file in files_per_label[label]:
            if file in excluded:
                continue
            bbe = thermo_data.get(file)
            if bbe is None:
                continue
            val = getattr(bbe, attr, None)
            if val is not None and val < e_min:
                e_min = val
    if not math.isfinite(e_min):
        raise ValueError(
            "No files in any label had a usable energy attribute "
            f"({attr}); cannot compute selectivity."
        )

    raw = {label: 0.0 for label in labels}
    rt = GAS_CONSTANT * temperature  # J/mol
    for label in labels:
        for file in files_per_label[label]:
            if file in excluded:
                continue
            bbe = thermo_data.get(file)
            if bbe is None:
                continue
            val = getattr(bbe, attr, None)
            if val is None:
                continue
            raw[label] += math.exp(-(val - e_min) * J_TO_AU / rt)

    total = sum(raw.values())
    if total == 0.0:
        raise ValueError(
            "Boltzmann sums are zero across all labels; cannot compute "
            "selectivity."
        )

    populations = {label: raw[label] / total for label in labels}
    preferred = max(populations, key=populations.get)

    ee = None
    ddG = None
    if len(labels) == 2:
        a, b = labels
        pa = populations[a]
        pb = populations[b]
        ee = abs(pa - pb) * 100.0
        # ΔΔG‡ = RT ln(p_major / p_minor), in Hartree, positive by convention
        # (the gap between major and minor TS). None when one species is
        # empty enough that the ratio diverges.
        if pa > 0 and pb > 0:
            ddG = rt * math.log(max(pa, pb) / min(pa, pb)) / J_TO_AU
        else:
            ddG = None

    return SelectivityResult(
        temperature=temperature,
        key=key,
        labels=labels,
        files_per_label={label: list(files_per_label[label]) for label in labels},
        populations=populations,
        raw_boltzmann=raw,
        preferred=preferred,
        ee=ee,
        ddG=ddG,
    )




[docs]
def compute_selectivity_scan(thermo_data, files_per_label, temperatures,
                             dup_list=None, key='gibbs'):
    """Compute a SelectivityResult at each temperature in `temperatures`.

    Convenience wrapper that pairs naturally with --ti temperature
    intervals; the species grouping is fixed across temperatures.
    """
    return [
        compute_selectivity(thermo_data, files_per_label, T,
                            dup_list=dup_list, key=key)
        for T in temperatures
    ]



def _lowest_per_label(thermo_data, files_per_label, dup_list, key):
    """Return a {label: [single_lowest_file]} reduction of files_per_label.

    For each species, picks the conformer with the lowest energy attribute,
    skipping files in dup_list[k][0] and files with no usable energy.
    """
    attr = SORT_KEYS[key]
    excluded = _excluded_files(dup_list)
    minimal = {}
    for label, files in files_per_label.items():
        best, best_val = None, math.inf
        for f in files:
            if f in excluded:
                continue
            bbe = thermo_data.get(f)
            if bbe is None:
                continue
            val = getattr(bbe, attr, None)
            if val is None:
                continue
            if val < best_val:
                best, best_val = f, val
        minimal[label] = [best] if best is not None else []
    return minimal



[docs]
def compute_selectivity_lowest_only(thermo_data, files_per_label, temperature,
                                     dup_list=None, key='gibbs'):
    """Selectivity using only the most stable conformer per species.

    For each label, picks the file with the lowest `key` (default
    qh_gibbs_free_energy), drops the rest, and runs the standard
    Boltzmann calc on the resulting 1-conformer-per-species set. The
    result complements compute_selectivity by showing how much of the
    selectivity comes from conformer mixing versus the gap between the
    lowest TSs.
    """
    minimal = _lowest_per_label(thermo_data, files_per_label, dup_list, key)
    return compute_selectivity(thermo_data, minimal, temperature,
                                 dup_list=dup_list, key=key)




[docs]
def compute_selectivity_lowest_only_scan(thermo_data, files_per_label,
                                           temperatures, dup_list=None,
                                           key='gibbs'):
    """Lowest-only selectivity at each temperature in `temperatures`."""
    minimal = _lowest_per_label(thermo_data, files_per_label, dup_list, key)
    return [compute_selectivity(thermo_data, minimal, T,
                                  dup_list=dup_list, key=key)
            for T in temperatures]



# ---------------------------------------------------------------------------
# Boltzmann weighting (unchanged)
# ---------------------------------------------------------------------------


[docs]
def get_boltz(thermo_data, temperature, dup_list, key='gibbs'):
    """Produce normalized Boltzmann populations across all files.

    Used by `--boltz` for the per-file population display and by the
    legacy `--ee` selectivity flow (which still calls `get_selectivity`).

    Parameters:
        thermo_data (dict): file path -> calc_bbe.
        temperature (float): Kelvin.
        dup_list (list): pairs [file_i, file_j]; entries whose path is the
            FIRST member of any pair are excluded (legacy behavior).
        key (str): 'gibbs' or 'energy'.

    Returns:
        dict[str, float]: file path -> normalized population (Σ = 1.0).
    """
    attr = SORT_KEYS[key]
    files = list(thermo_data)
    boltz_facs, e_min, boltz_sum = {}, sys.float_info.max, 0.0

    for file in files:  # Need the most stable structure
        val = getattr(thermo_data[file], attr, None)
        if val is not None and val < e_min:
            e_min = val

    # Calculate E_rel and Boltzmann factors
    for file in files:
        duplicate = False
        if dup_list:
            for dup in dup_list:
                if dup[0] == file:
                    duplicate = True
        if not duplicate:
            val = getattr(thermo_data[file], attr, None)
            if val is not None:
                boltz_facs[file] = math.exp(-(val - e_min) * J_TO_AU / GAS_CONSTANT / temperature)
                boltz_sum += boltz_facs[file]

    # Normalize to populations that sum to 1.0
    if boltz_sum > 0:
        for file in boltz_facs:
            boltz_facs[file] /= boltz_sum

    return boltz_facs



# ---------------------------------------------------------------------------
# Legacy --ee shim (deprecated; remove in v5.0)
# ---------------------------------------------------------------------------


[docs]
def get_selectivity(pattern, files, boltz_facs, temperature, dup_list):
    """DEPRECATED — legacy 'a:b' colon-pattern selectivity.

    Forwards to `compute_selectivity` with a 2-label spec built from the
    glob pattern. Emits a DeprecationWarning. Use --label / --selectivity
    going forward.

    Returns the legacy 6-tuple: (ee, er, ratio, dd_free_energy, failed,
    pref) so the existing CLI print path keeps working unchanged.
    """
    warnings.warn(
        "get_selectivity / --ee is deprecated; use --label / "
        "--selectivity instead. Will be removed in v5.0.",
        DeprecationWarning,
        stacklevel=2,
    )
    parts = pattern.split(':')
    if len(parts) != 2:
        raise ValueError(
            f"Invalid selectivity pattern '{pattern}'. "
            "Expected format: 'pattern_a:pattern_b' with exactly one colon."
        )
    a_regex, b_regex = parts[0].strip(), parts[1].strip()
    if not a_regex or not b_regex:
        raise ValueError(
            f"Invalid selectivity pattern '{pattern}'. "
            "Both patterns before and after ':' must be non-empty."
        )

    A = ''.join(a for a in a_regex if a.isalnum())
    B = ''.join(b for b in b_regex if b.isalnum())

    # Legacy behavior: glob the filesystem (one or more parent dirs), not
    # fnmatch against `files`.
    dirs = list(set(os.path.dirname(f) for f in files))
    a_files, b_files = [], []
    if len(dirs) > 1 or (dirs and dirs[0] != ''):
        for d in dirs:
            a_files.extend(glob(d + '/' + a_regex))
            b_files.extend(glob(d + '/' + b_regex))
    else:
        a_files.extend(glob(a_regex))
        b_files.extend(glob(b_regex))

    if not a_files or not b_files:
        log.info("\n   Warning! Filenames have not been formatted correctly for determining selectivity\n")
        log.info("   Make sure the filename contains either " + A + " or " + B + "\n")
        sys.exit("   Please edit either your filenames or selectivity pattern argument and try again\n")

    # Restrict to files we have thermo data for, and that haven't been
    # marked as duplicates (legacy: only dup[0] is excluded here).
    excluded = set(dup[0] for dup in (dup_list or []))
    a_files = [f for f in a_files if f in boltz_facs and f not in excluded]
    b_files = [f for f in b_files if f in boltz_facs and f not in excluded]
    a_sum = sum(boltz_facs[f] for f in a_files)
    b_sum = sum(boltz_facs[f] for f in b_files)

    A_round = round(a_sum * 100)
    B_round = round(b_sum * 100)
    er = f'{A_round}:{B_round}'
    failed = False
    if a_sum > b_sum:
        pref = A
        try:
            r = a_sum / b_sum
            ratio = f'{round(r, 1)}:1' if r < 3 else f'{round(r)}:1'
        except ZeroDivisionError:
            ratio = '1:0'
    else:
        pref = B
        try:
            r = b_sum / a_sum
            ratio = f'1:{round(r, 1)}' if r < 3 else f'1:{round(r)}'
        except ZeroDivisionError:
            ratio = '0:1'

    ee = (a_sum - b_sum) * 100.0
    if ee == 0:
        log.info("\n   Warning! No files found for an enantioselectivity analysis, adjust the stereodetermining step name and try again.\n")
        failed = True
    ee = abs(ee)
    try:
        dd_free_energy = GAS_CONSTANT / J_TO_AU * temperature * math.log((50 + ee / 2.0) / (50 - ee / 2.0)) * KCAL_TO_AU
    except (ZeroDivisionError, ValueError):
        dd_free_energy = 0.0
    return ee, er, ratio, dd_free_energy, failed, pref