Source code for goodvibes.sort

"""Structure sorting and duplicate detection for GoodVibes."""
import logging
import numpy as np

from .constants import KCAL_TO_AU

log = logging.getLogger('goodvibes')



[docs]
def kabsch_rmsd(coords_a, coords_b):
    """
    Compute the RMSD between two Nx3 Cartesian coordinate sets after optimal rigid alignment using the Kabsch algorithm.
    
    Parameters:
        coords_a (array-like): Reference coordinates with shape (N, 3).
        coords_b (array-like): Mobile coordinates with shape (N, 3).
    
    Returns:
        float: Root-mean-square deviation between the aligned coordinates, in the same units as the inputs.
    """
    a = np.array(coords_a, dtype=float)
    b = np.array(coords_b, dtype=float)
    # Center both structures
    a -= a.mean(axis=0)
    b -= b.mean(axis=0)
    # Kabsch: find optimal rotation via SVD of cross-covariance matrix
    H = b.T @ a
    U, _, Vt = np.linalg.svd(H)
    # Correct for reflection
    d = np.linalg.det(Vt.T @ U.T)
    sign_matrix = np.diag([1.0, 1.0, d])
    R = Vt.T @ sign_matrix @ U.T
    b_aligned = b @ R.T
    return np.sqrt(np.mean((a - b_aligned) ** 2))




[docs]
def deduplicate(thermo_data, *, e_cutoff=0.05, ro_cutoff=0.01,
                rmsd_cutoff=None):
    """Identify duplicate or enantiomeric structures by comparing energies,
    rotational constants, and optionally Cartesian RMSD.

    All active criteria must pass for a pair to be flagged as duplicate.

    Parameters:
        thermo_data (dict): file path → calc_bbe mapping.
        e_cutoff (float): max absolute SCF energy difference in kcal/mol (default 0.05).
        ro_cutoff (float): max relative difference in rotational constants as a
            fraction, e.g. 0.01 = 1% (default 0.01).
        rmsd_cutoff (float or None): max Cartesian RMSD in Angstrom.
            None (default) disables RMSD comparison; 0.125 matches CREST.

    Returns:
        list: pairs [file_i, file_j] flagged as duplicates.
    """
    files = list(thermo_data)
    dup_list = []
    e_cutoff_au = e_cutoff / KCAL_TO_AU  # Convert kcal/mol to Hartree
    cutoff_msg = "\n   Checking for duplicate structures. Applying: e_cutoff={} kcal/mol, ro_cutoff={}%".format(e_cutoff, ro_cutoff * 100)
    if rmsd_cutoff is not None:
        cutoff_msg += ", rmsd_cutoff={} A".format(rmsd_cutoff)
    log.info("\n" + cutoff_msg)
    for i, file in enumerate(files):
        for j in range(0, i):
            bbe_i, bbe_j = thermo_data[files[i]], thermo_data[files[j]]

            # Energy gate (cheap): reject the pair before computing ro_diff or RMSD.
            if not (hasattr(bbe_i, "scf_energy") and hasattr(bbe_j, "scf_energy")):
                continue
            if bbe_i.scf_energy is None or bbe_j.scf_energy is None:
                continue
            if abs(bbe_i.scf_energy - bbe_j.scf_energy) >= e_cutoff_au:
                continue

            # Rotational-constants gate (microseconds): only run if energy passed.
            if not (hasattr(bbe_i, "roconst") and hasattr(bbe_j, "roconst")):
                continue
            if len(bbe_i.roconst) != len(bbe_j.roconst):
                continue
            ri = np.array(bbe_i.roconst)
            rj = np.array(bbe_j.roconst)
            avg = 0.5 * (ri + rj)
            nonzero = avg > 0
            if np.any(nonzero):
                ro_diff = np.max(np.abs(ri[nonzero] - rj[nonzero]) / avg[nonzero])
            else:
                # Both structures have all-zero rotational constants
                # (e.g. single atoms) — treat as matching
                ro_diff = 0.0
            if ro_diff >= ro_cutoff:
                continue

            # Kabsch RMSD gate (most expensive): only run if e and ro both passed.
            if rmsd_cutoff is not None:
                if not (hasattr(bbe_i, "cartesians") and hasattr(bbe_j, "cartesians")):
                    continue
                coords_i = np.array(bbe_i.cartesians)
                coords_j = np.array(bbe_j.cartesians)
                if coords_i.shape != coords_j.shape or len(coords_i) == 0:
                    continue
                if kabsch_rmsd(coords_i, coords_j) >= rmsd_cutoff:
                    continue

            dup_list.append([files[i], files[j]])
    return dup_list



SORT_KEYS = {
    'energy': 'scf_energy',
    'gibbs': 'qh_gibbs_free_energy',
}



[docs]
def sort_thermo(thermo_data, key):
    """Return thermo_data reordered by the given energy attribute (lowest first).

    Entries with linear_warning, missing the attribute, or with a None value are placed at the end.

    Parameters:
        thermo_data (dict): file path → calc_bbe mapping.
        key (str): sort mode — 'energy' (scf_energy) or 'gibbs' (qh_gibbs_free_energy).

    Returns:
        dict: new dict with the same items in sorted order.
    """
    attr = SORT_KEYS[key]
    inf = float('inf')

    def sort_val(item):
        """
        Compute the sort key for a thermochemistry mapping item, treating flagged or missing values as infinite so they sort last.
        
        Parameters:
            item (tuple): A (key, calc_bbe) pair where `calc_bbe` is the object containing thermochemical attributes.
        
        Returns:
            float: The attribute value `getattr(calc_bbe, attr)` when present and not None; `float('inf')` if `calc_bbe.linear_warning` is truthy or the attribute is missing/None.
        """
        bbe = item[1]
        if getattr(bbe, 'linear_warning', False):
            return inf
        val = getattr(bbe, attr, None)
        return val if val is not None else inf

    return dict(sorted(thermo_data.items(), key=sort_val))