Source code for goodvibes.io

# -*- coding: utf-8 -*-
import json
import os.path
import re
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import List, Optional

import numpy as np




[docs]
@dataclass
class QCData:
    """Program-agnostic container for parsed quantum chemistry data.

    Populated by parse_qcdata() in io.py. Consumed by calc_bbe in thermo.py.
    """
    # Provenance
    file: str = ''
    program: str = ''
    version_program: str = ''
    job_type: str = ''

    # Electronic structure
    scf_energy: Optional[float] = None
    charge: Optional[int] = None
    multiplicity: int = 1

    # Model chemistry metadata
    solvation_model: str = ''
    empirical_dispersion: str = ''

    # Molecular properties
    molecular_mass: float = 0.0
    symmno: int = 1
    linear_mol: bool = False
    point_group: str = ''

    # Rotational data
    roconst: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])
    rotemp: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])
    linear_warning: bool = False

    # Frequencies (raw from parser — positive and negative separated)
    frequency_wn: List[float] = field(default_factory=list)
    im_frequency_wn: List[float] = field(default_factory=list)

    # Thermal corrections
    zero_point_corr: Optional[float] = None

    # CPU time [days, hours, mins, secs, msecs]
    cpu: List[int] = field(default_factory=lambda: [0, 0, 0, 0, 0])

    # Number of MPI processes / cores reported by the QC program. Used by
    # the ORCA parser to convert TOTAL RUN TIME (wall) into an effective
    # CPU-time estimate (wall × nprocs); 1 elsewhere.
    nprocs: int = 1

    # ONIOM MM frequency scaling fractions (per-frequency, empty if not ONIOM)
    fract_modelsys: List[float] = field(default_factory=list)
    has_oniom: bool = False

    # Molecular geometry
    atom_nums: List[int] = field(default_factory=list)
    atom_types: List[str] = field(default_factory=list)
    cartesians: List[List[float]] = field(default_factory=list)

    # Frequency scaling factor applied by QC program (ORCA only; 1.0 means unscaled)
    applied_freq_scale_factor: float = 1.0

    # Single-point-correction (SPC) cache. Populated by calc_bbe when --spc
    # is used so the SPC file isn't re-parsed on subsequent --import runs.
    # `sp_suffix` is the --spc value that produced the cached numbers; on
    # a re-run with the same suffix the cache is reused, on a different
    # suffix it's bypassed and refreshed. `sp_file` is the resolved
    # absolute path of the SPC output (diagnostic only).
    sp_energy: Optional[float] = None
    sp_version_program: str = ''
    sp_solvation_model: str = ''
    sp_charge: Optional[int] = None
    sp_empirical_dispersion: str = ''
    sp_multiplicity: Optional[int] = None
    sp_suffix: str = ''
    sp_file: str = ''



# Cache version for JSON serialization format
CACHE_VERSION = 1



[docs]
def qcdata_to_dict(qcdata):
    """Convert a QCData instance to a JSON-serializable dictionary."""
    d = asdict(qcdata)
    d['_cache_version'] = CACHE_VERSION
    return d




[docs]
def dict_to_qcdata(d):
    """Reconstruct a QCData instance from a dictionary."""
    clean = {k: v for k, v in d.items() if not k.startswith('_')}
    return QCData(**clean)




[docs]
def save_cache(cache_dict, path):
    """Write QCData cache to a JSON file.

    Parameters
    ----------
    cache_dict : dict
        Mapping of {basename_key: qcdata_dict} where each value is from qcdata_to_dict().
    path : str
        Output JSON file path.
    """
    envelope = {
        '_cache_version': CACHE_VERSION,
        '_created': datetime.now().isoformat(),
        'entries': cache_dict,
    }
    with open(path, 'w') as f:
        json.dump(envelope, f, indent=2)




[docs]
def load_cache(path):
    """Load a QCData cache from a JSON file.

    Auto-detects the format. v1.0+ payloads (the unified schema written
    by `--export` / `--json`) are read by extracting each result's
    `qcdata` block; the legacy cache envelope (`{_cache_version, entries}`,
    written by pre-v5.0 `--cache-save`) is also accepted for back-compat.

    Parameters
    ----------
    path : str
        Path to JSON file (either v1.0 unified schema or legacy cache).

    Returns
    -------
    dict
        Mapping of {basename_key: QCData}.
    """
    with open(path, 'r') as f:
        data = json.load(f)
    if 'schema_version' in data:
        # Unified v1.0+ payload — use each result's qcdata block.
        from .schema import validate
        validate(data)
        cache = {}
        for entry in data.get('results', []):
            qc = entry.get('qcdata')
            if qc is None:
                continue
            name = entry.get('name')
            if name is None:
                # Fall back to deriving from the file path.
                fp = entry.get('file', '')
                name = os.path.splitext(os.path.basename(fp))[0]
            cache[name] = dict_to_qcdata(qc)
        return cache
    # Legacy envelope: {_cache_version, _created, entries}
    version = data.get('_cache_version', 0)
    if version != CACHE_VERSION:
        raise ValueError(
            "Cache version mismatch: expected {} (legacy) or a v1.0+ "
            "schema_version, got _cache_version={}".format(CACHE_VERSION, version)
        )
    return {key: dict_to_qcdata(val) for key, val in data['entries'].items()}



# PHYSICAL CONSTANTS                                      UNITS
KCAL_TO_AU = 627.509541  # UNIT CONVERSION

# Radii used to determine connectivity in symmetry corrections
# Covalent radii taken from Cambridge Structural Database
RADII = {'H': 0.32, 'He': 0.93, 'Li': 1.23, 'Be': 0.90, 'B': 0.82, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.72,
         'Ne': 0.71, 'Na': 1.54, 'Mg': 1.36, 'Al': 1.18, 'Si': 1.11, 'P': 1.06, 'S': 1.02, 'Cl': 0.99, 'Ar': 0.98,
         'K': 2.03, 'Ca': 1.74, 'Sc': 1.44, 'Ti': 1.32, 'V': 1.22, 'Cr': 1.18, 'Mn': 1.17, 'Fe': 1.17, 'Co': 1.16,
         'Ni': 1.15, 'Cu': 1.17, 'Zn': 1.25, 'Ga': 1.26, 'Ge': 1.22, 'As': 1.20, 'Se': 1.16, 'Br': 1.14, 'Kr': 1.12,
         'Rb': 2.16, 'Sr': 1.91, 'Y': 1.62, 'Zr': 1.45, 'Nb': 1.34, 'Mo': 1.30, 'Tc': 1.27, 'Ru': 1.25, 'Rh': 1.25,
         'Pd': 1.28, 'Ag': 1.34, 'Cd': 1.48, 'In': 1.44, 'Sn': 1.41, 'Sb': 1.40, 'Te': 1.36, 'I': 1.33, 'Xe': 1.31,
         'Cs': 2.35, 'Ba': 1.98, 'La': 1.69, 'Lu': 1.60, 'Hf': 1.44, 'Ta': 1.34, 'W': 1.30, 'Re': 1.28, 'Os': 1.26,
         'Ir': 1.27, 'Pt': 1.30, 'Au': 1.34, 'Hg': 1.49, 'Tl': 1.48, 'Pb': 1.47, 'Bi': 1.46, 'X': 0}
# Bondi van der Waals radii for all atoms from: Bondi, A. J. Phys. Chem. 1964, 68, 441-452,
# except hydrogen, which is taken from Rowland, R. S.; Taylor, R. J. Phys. Chem. 1996, 100, 7384-7391.
# Radii unavailable in either of these publications are set to 2 Angstrom
# (Unfinished)
BONDI = {'H': 1.09, 'He': 1.40, 'Li': 1.82, 'Be': 2.00, 'B': 2.00, 'C': 1.70, 'N': 1.55, 'O': 1.52, 'F': 1.47,
         'Ne': 1.54}

# Some useful arrays
periodictable = ["", "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si",
                 "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
                 "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd",
                 "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm",
                 "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt",
                 "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu",
                 "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
                 "Rg", "Uub", "Uut", "Uuq", "Uup", "Uuh", "Uus", "Uuo"]


[docs]
def element_id(massno, num=False):
    """
    Get element symbol from mass number.

    Used in parsing output files to determine elements present in file.

    Parameter:
    massno (int): mass of element.

    Returns:
    str: element symbol, or 'XX' if not found in periodic table.
    """
    try:
        if num:
            return periodictable.index(massno)
        return periodictable[massno]
    except IndexError:
        return "XX"



# Standard atomic weights (IUPAC 2021), most common isotope where applicable.
# Used by parse_ase_thermo when an extxyz fixture omits molecular_mass / rotational
# constants and they must be computed from the geometry.
ATOMIC_MASSES = {
    'H': 1.00782503207, 'He': 4.002602,
    'Li': 7.0160034366, 'Be': 9.012183065, 'B': 11.00930536, 'C': 12.0,
    'N': 14.0030740048, 'O': 15.99491461957, 'F': 18.998403163, 'Ne': 19.9924401762,
    'Na': 22.9897692820, 'Mg': 23.985041697, 'Al': 26.98153853, 'Si': 27.97692653465,
    'P': 30.97376199842, 'S': 31.9720711744, 'Cl': 34.968852682, 'Ar': 39.9623831237,
    'K': 38.96370648, 'Ca': 39.96259098, 'Sc': 44.95590828, 'Ti': 47.94794198,
    'V': 50.94395704, 'Cr': 51.94050623, 'Mn': 54.93804451, 'Fe': 55.93493633,
    'Co': 58.93319429, 'Ni': 57.93534241, 'Cu': 62.92959772, 'Zn': 63.92914201,
    'Ga': 68.9255735, 'Ge': 73.921177761, 'As': 74.92159457, 'Se': 79.9165218,
    'Br': 78.9183376, 'Kr': 83.9114977282,
    'Rb': 84.91178974, 'Sr': 87.9056125, 'Y': 88.9058403, 'Zr': 89.9046977,
    'Nb': 92.906373, 'Mo': 97.90540482, 'Tc': 98.9062508, 'Ru': 101.9043441,
    'Rh': 102.905498, 'Pd': 105.9034804, 'Ag': 106.9050916, 'Cd': 113.90336509,
    'In': 114.903878776, 'Sn': 119.90220163, 'Sb': 120.903812, 'Te': 129.906222748,
    'I': 126.9044719, 'Xe': 131.9041550856,
}


def _principal_moments_of_inertia(atom_types, cartesians):
    """Return (total_mass [amu], [I_a, I_b, I_c] in amu*Angstrom^2).

    Used as a fallback when an extxyz fixture omits molecular_mass / roconst /
    rotemp; computes them from the geometry using ATOMIC_MASSES.
    """
    masses = np.array([ATOMIC_MASSES.get(a, 0.0) for a in atom_types])
    coords = np.array(cartesians, dtype=float)
    total = float(masses.sum())
    if total <= 0.0 or len(coords) == 0:
        return total, np.array([0.0, 0.0, 0.0])
    com = (masses[:, None] * coords).sum(axis=0) / total
    r = coords - com
    Ixx = float((masses * (r[:, 1] ** 2 + r[:, 2] ** 2)).sum())
    Iyy = float((masses * (r[:, 0] ** 2 + r[:, 2] ** 2)).sum())
    Izz = float((masses * (r[:, 0] ** 2 + r[:, 1] ** 2)).sum())
    Ixy = float(-(masses * r[:, 0] * r[:, 1]).sum())
    Ixz = float(-(masses * r[:, 0] * r[:, 2]).sum())
    Iyz = float(-(masses * r[:, 1] * r[:, 2]).sum())
    inertia = np.array([[Ixx, Ixy, Ixz], [Ixy, Iyy, Iyz], [Ixz, Iyz, Izz]])
    eigvals = np.linalg.eigvalsh(inertia)
    eigvals = np.sort(np.maximum(eigvals, 0.0))
    return total, eigvals

def _fill_mass_and_rotemp_from_geometry(qcdata):
    """Populate molecular_mass and rotational constants from geometry.

    Fallback used when a QC program omits the THERMOCHEMISTRY block
    (e.g. Gaussian freq-without-opt, ORCA --hess-only) — recomputes from
    atom_types + cartesians via ATOMIC_MASSES so that downstream
    translational/rotational entropy doesn't see mass=0 or rotemp=[0,0,0].
    No-op when the parser already populated the fields.
    """
    if not (qcdata.atom_types and qcdata.cartesians):
        return
    needs_mass = qcdata.molecular_mass == 0.0
    needs_rotemp = qcdata.rotemp == [0.0, 0.0, 0.0]
    if not (needs_mass or needs_rotemp):
        return
    total_mass, eigvals = _principal_moments_of_inertia(
        qcdata.atom_types, qcdata.cartesians)
    if needs_mass:
        qcdata.molecular_mass = total_mass
    if needs_rotemp:
        HC_OVER_KB = 1.4387768775  # K per cm⁻¹
        roconst_cm = [16.857629 / I if I > 1e-3 else 0.0 for I in eigvals]
        qcdata.roconst = [b * 29.9792458 for b in roconst_cm]
        qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm]



[docs]
def compute_connectivity(atom_types, cartesians, tolerance=0.2):
    """Compute molecular connectivity based on covalent radii."""
    connectivity = []
    for i, ai in enumerate(atom_types):
        row = []
        for j, aj in enumerate(atom_types):
            if i == j:
                continue
            cutoff = RADII[ai] + RADII[aj] + tolerance
            distance = np.linalg.norm(np.array(cartesians[i]) - np.array(cartesians[j]))
            if distance < cutoff:
                row.append(j)
        connectivity.append(row)
    return connectivity



[docs]
def write_xyz(filepath, files, thermo_data):
    """Write optimized Cartesian coordinates to a multi-structure .xyz file.

    Each structure block contains the atom count, a comment line with the
    filename and SCF energy, and the Cartesian coordinates.

    Parameters:
        filepath (str): output .xyz file path.
        files (list): output file paths whose coordinates to write.
        thermo_data (dict): file path → calc_bbe mapping.
    """
    from .utils import display_name
    with open(filepath, 'w') as f:
        for file in files:
            bbe = thermo_data[file]
            f.write(f'{len(bbe.atom_types)}\n')
            if bbe.scf_energy is not None:
                f.write(f'{display_name(file):<39} {"Eopt":>13} {bbe.scf_energy:.6f}\n')
            else:
                f.write(f'{display_name(file):<39}\n')
            if bbe.atom_types and bbe.cartesians:
                for atom, carts in zip(bbe.atom_types, bbe.cartesians):
                    f.write(f'{atom:>1}{carts[0]:13.6f}{carts[1]:13.6f}{carts[2]:13.6f}\n')



[docs]
def find_spc_file(name, spc):
    """Locate the single-point-correction output paired with ``name``.

    Pattern matched: ``{name}_{spc}.{log,out}`` — exact match. ``--spc TZVP``
    finds ``filename_TZVP.log`` only; it will NOT match ``filename_def2_TZVP.log``.
    To pair with a file like ``filename_def2_TZVP.log``, pass the full suffix
    (``--spc def2_TZVP``).

    Parameters
    ----------
    name : str
        Path stem of the parent output file (no extension).
    spc : str
        Suffix passed via ``--spc``.

    Returns
    -------
    str or None
        Path to the matching file, or None if nothing matches.
    """
    for ext in ('.log', '.out'):
        candidate = f'{name}_{spc}{ext}'
        if os.path.isfile(candidate):
            return candidate
    return None




[docs]
def parse_data(file):
    """
    Read computational chemistry output file.

    Attempt to obtain single point energy, program type, program version, solvation_model,
    charge, empirical_dispersion, and multiplicity from file.

    Parameter:
    file (str): name of file to be parsed.

    Returns:
    float: single point energy.
    str: program used to run calculation.
    str: version of program used to run calculation.
    str: solvation model used in chemical calculation (if any).
    str: original filename parsed.
    int: overall charge of molecule or chemical system.
    str: empirical dispersion used in chemical calculation (if any).
    int: multiplicity of molecule or chemical system.
    """
    spe, program, data, version_program, solvation_model, keyword_line, a, charge, multiplicity = 'none', 'none', [], '', '', '', 0, None, None

    data = None
    stub = os.path.splitext(file)[0]
    possible_filenames = (stub + ".log", stub + ".out", stub + ".extxyz", file)
    actual_file = file
    for possible_filename in possible_filenames:
        if os.path.exists(possible_filename):
            actual_file = possible_filename
            with open(possible_filename, encoding='utf-8', errors='replace') as f:
                data = f.readlines()
            break

    if data is None:
        raise ValueError("File {} does not exist".format(file))

    # ASE extxyz: delegate to parse_ase_thermo and translate to parse_data's
    # tuple shape; ASE files have no separate single-point semantics.
    if len(data) >= 2 and 'program=ase' in data[1]:
        q = parse_ase_thermo(actual_file)
        return (q.scf_energy if q.scf_energy is not None else 'none',
                'ase',
                q.version_program,
                q.solvation_model or 'gas phase',
                file,
                q.charge if q.charge is not None else 0,
                q.empirical_dispersion or 'No empirical dispersion detected',
                q.multiplicity)

    # Q-Chem: delegate to parse_qchem_thermo (Q-Chem energy lines, MP2/CCSD
    # precedence, and $molecule echo are all already handled there).
    if any('You are running Q-Chem' in ln or 'Welcome to Q-Chem' in ln for ln in data[:80]):
        q = parse_qchem_thermo(actual_file)
        return (q.scf_energy if q.scf_energy is not None else 'none',
                'QChem',
                q.version_program,
                q.solvation_model or 'gas phase',
                file,
                q.charge if q.charge is not None else 0,
                q.empirical_dispersion or 'No empirical dispersion detected',
                q.multiplicity)

    for line in data:
        if "Gaussian" in line:
            program = "Gaussian"
            break
        if "* O   R   C   A *" in line:
            program = "Orca"
            break
        if "NWChem" in line:
            program = "NWChem"
            break
        if "x T B" in line or "xtb version" in line:
            program = "xtb"
            break
    repeated_link1 = 0
    freq_started = False  # Guard against VPT2 displaced geometry energies
    zero_point_corr_G4 = 0.0

    for line in data:
        if program == "Gaussian":
            # Reset freq_started at each new link (linked jobs have separate links)
            if 'Normal termination' in line:
                freq_started = False
            if line.strip().startswith('Frequencies -- '):
                freq_started = True
            if not freq_started and line.strip().startswith('SCF Done:'):
                spe = float(line.strip().split()[4])
            elif not freq_started and line.strip().startswith('E2('):
                spe_value = line.strip().split()[-1]
                spe = float(spe_value.replace('D','E'))
            elif not freq_started and 'EUMP2 =' in line.strip():
                spe = float((line.strip().split()[5]).replace('D', 'E'))
            elif 'CCSD(T)=' in line.strip():
                raw = line.strip().split('CCSD(T)=')[1].split('\\')[0].split()[0]
                spe = float(raw.replace('D', 'E'))
            elif 'CCSD=' in line.strip() and 'CCSD(T)' not in line.strip():
                raw = line.strip().split('CCSD=')[1].split('\\')[0].split()[0]
                spe = float(raw.replace('D', 'E'))
            elif line.strip().startswith('Counterpoise corrected energy'):
                spe = float(line.strip().split()[4])
            # For ONIOM calculations use the extrapolated value rather than SCF value
            elif "ONIOM: extrapolated energy" in line.strip():
                spe = (float(line.strip().split()[4]))
            # For G4 calculations look for G4 energies (Gaussian16a bug prints G4(0 K) as DE(HF)) --Brian modified to work for G16c-where bug is fixed.
            elif line.strip().startswith('G4(0 K)'):
                spe = float(line.strip().split()[2])
                spe -= zero_point_corr_G4 #Remove G4 ZPE
            elif line.strip().startswith('E(ZPE)='): #Get G4 ZPE
                zero_point_corr_G4 = float(line.strip().split()[1])
            # For TD calculations look for SCF energies of the first excited state
            elif 'E(TD-HF/TD-DFT)' in line.strip():
                spe = float(line.strip().split()[4])
            # For Semi-empirical or Molecular Mechanics calculations
            elif "Energy= " in line.strip() and "Predicted" not in line.strip() and "Thermal" not in line.strip() and "G4" not in line.strip():
                spe = (float(line.strip().split()[1]))
            elif "Gaussian" in line and "Revision" in line and repeated_link1 == 0:
                for i in range(len(line.strip(",").split(",")) - 1):
                    line.strip(",").split(",")[i]
                    version_program += line.strip(",").split(",")[i]
                    repeated_link1 = 1
                version_program = version_program[1:]
            # Charge and multiplicity
            elif 'Charge' in line and 'Multiplicity' in line:
                try:
                    parts = line.split()
                    charge = int(parts[parts.index('Charge') + 2])
                    multiplicity = int(parts[parts.index('Multiplicity') + 2])
                except (ValueError, IndexError):
                    pass
        elif program == "Orca":
            if 'Program Version' in line.strip():
                version_program = "ORCA version " + line.split()[2]
            if line.strip().startswith('FINAL SINGLE POINT ENERGY'):
                spe = float(line.strip().split()[-1])
            if "Total Charge" in line.strip() and "...." in line.strip():
                charge = int(line.strip("=").split()[-1])
            if "Multiplicity" in line.strip() and "...." in line.strip():
                multiplicity = int(line.strip("=").split()[-1])
        elif program == "NWChem":
            if 'nwchem branch' in line.strip():
                version_program = "NWChem version " + line.split()[3]
            if line.strip().startswith('Total DFT energy'):
                spe = float(line.strip().split()[4])
            if "charge" in line.strip():
                charge = int(line.strip().split()[-1])
            if "mult " in line.strip():
                multiplicity = int(line.strip().split()[-1])
            elif "Spin multiplicity:" in line:
                try:
                    multiplicity = int(line.split(":", 1)[1].strip().split()[0])
                except (ValueError, IndexError):
                    pass
        elif program == "xtb":
            stripped = line.strip()
            if 'xtb version' in stripped and not version_program:
                parts = stripped.split()
                try:
                    version_program = "xtb version " + parts[parts.index('version') + 1]
                except (ValueError, IndexError):
                    pass
            if stripped.startswith(':: total energy'):
                parts = stripped.split()
                try:
                    spe = float(parts[3])
                except (ValueError, IndexError):
                    pass
            if 'net charge' in stripped and stripped.startswith(':'):
                parts = stripped.split()
                try:
                    charge = int(parts[parts.index('charge') + 1])
                except (ValueError, IndexError):
                    pass
            if 'unpaired electrons' in stripped and stripped.startswith(':'):
                parts = stripped.split()
                try:
                    multiplicity = int(parts[parts.index('electrons') + 1]) + 1
                except (ValueError, IndexError):
                    pass

    # Solvation model and empirical dispersion detection
    if 'Gaussian' in version_program.strip():
        for i, line in enumerate(data):
            if '#' in line.strip() and a == 0:
                for j, line in enumerate(data[i:i + 10]):
                    if '--' in line.strip():
                        a = a + 1
                        break
                    if a != 0:
                        break
                    else:
                        for k in range(len(line.strip().split("\n"))):
                            line.strip().split("\n")[k]
                            keyword_line += line.strip().split("\n")[k]
        keyword_line = keyword_line.lower()
        if 'scrf' not in keyword_line.strip():
            solvation_model = "gas phase"
        else:
            start_scrf = keyword_line.strip().find('scrf') + 4
            if '(' in keyword_line[start_scrf:start_scrf + 4]:
                start_scrf += keyword_line[start_scrf:start_scrf + 4].find('(') + 1
                end_scrf = keyword_line.find(")", start_scrf)
                display_solvation_model = "scrf=(" + ','.join(
                    keyword_line[start_scrf:end_scrf].lower().split(',')) + ')'
                sorted_solvation_model = "scrf=(" + ','.join(
                    sorted(keyword_line[start_scrf:end_scrf].lower().split(','))) + ')'
            else:
                if ' = ' in keyword_line[start_scrf:start_scrf + 4]:
                    start_scrf += keyword_line[start_scrf:start_scrf + 4].find(' = ') + 3
                elif ' =' in keyword_line[start_scrf:start_scrf + 4]:
                    start_scrf += keyword_line[start_scrf:start_scrf + 4].find(' =') + 2
                elif '=' in keyword_line[start_scrf:start_scrf + 4]:
                    start_scrf += keyword_line[start_scrf:start_scrf + 4].find('=') + 1
                end_scrf = keyword_line.find(" ", start_scrf)
                if end_scrf == -1:
                    display_solvation_model = "scrf=(" + ','.join(keyword_line[start_scrf:].lower().split(',')) + ')'
                    sorted_solvation_model = "scrf=(" + ','.join(
                        sorted(keyword_line[start_scrf:].lower().split(','))) + ')'
                else:
                    display_solvation_model = "scrf=(" + ','.join(
                        keyword_line[start_scrf:end_scrf].lower().split(',')) + ')'
                    sorted_solvation_model = "scrf=(" + ','.join(
                        sorted(keyword_line[start_scrf:end_scrf].lower().split(','))) + ')'
        if solvation_model != "gas phase":
            solvation_model = [sorted_solvation_model, display_solvation_model]
        empirical_dispersion = ''
        if keyword_line.strip().find('empiricaldispersion') == -1 and keyword_line.strip().find(
                'emp=') == -1 and keyword_line.strip().find('emp =') == -1 and keyword_line.strip().find('emp(') == -1:
            empirical_dispersion = "No empirical dispersion detected"
        elif keyword_line.strip().find('empiricaldispersion') > -1:
            start_emp_disp = keyword_line.strip().find('empiricaldispersion') + 19
            if '(' in keyword_line[start_emp_disp:start_emp_disp + 4]:
                start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('(') + 1
                end_emp_disp = keyword_line.find(")", start_emp_disp)
                empirical_dispersion = 'empiricaldispersion=(' + ','.join(
                    sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')'
            else:
                if ' = ' in keyword_line[start_emp_disp:start_emp_disp + 4]:
                    start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find(' = ') + 3
                elif ' =' in keyword_line[start_emp_disp:start_emp_disp + 4]:
                    start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find(' =') + 2
                elif '=' in keyword_line[start_emp_disp:start_emp_disp + 4]:
                    start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('=') + 1
                end_emp_disp = keyword_line.find(" ", start_emp_disp)
                if end_emp_disp == -1:
                    empirical_dispersion = "empiricaldispersion=(" + ','.join(
                        sorted(keyword_line[start_emp_disp:].lower().split(','))) + ')'
                else:
                    empirical_dispersion = "empiricaldispersion=(" + ','.join(
                        sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')'
        elif keyword_line.strip().find('emp=') > -1 or keyword_line.strip().find(
                'emp =') > -1 or keyword_line.strip().find('emp(') > -1:
            # Check for temp keyword
            temp, emp_e, emp_p = False, False, False
            check_temp = keyword_line.strip().find('emp=')
            start_emp_disp = keyword_line.strip().find('emp=')
            if check_temp == -1:
                check_temp = keyword_line.strip().find('emp =')
                start_emp_disp = keyword_line.strip().find('emp =')
            if check_temp == -1:
                check_temp = keyword_line.strip().find('emp=(')
                start_emp_disp = keyword_line.strip().find('emp(')
            check_temp += -1
            if keyword_line[check_temp].lower() == 't':
                temp = True  # Look for a new one
                if keyword_line.strip().find('emp=', check_temp + 5) > -1:
                    emp_e = True
                    start_emp_disp = keyword_line.strip().find('emp=', check_temp + 5) + 3
                elif keyword_line.strip().find('emp =', check_temp + 5) > -1:
                    emp_e = True
                    start_emp_disp = keyword_line.strip().find('emp =', check_temp + 5) + 3
                elif keyword_line.strip().find('emp(', check_temp + 5) > -1:
                    emp_p = True
                    start_emp_disp = keyword_line.strip().find('emp(', check_temp + 5) + 3
                else:
                    empirical_dispersion = "No empirical dispersion detected"
            else:
                start_emp_disp += 3
            if (temp and emp_e) or (not temp and keyword_line.strip().find('emp=') > -1) or (
                    not temp and keyword_line.strip().find('emp =')):
                if '(' in keyword_line[start_emp_disp:start_emp_disp + 4]:
                    start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('(') + 1
                    end_emp_disp = keyword_line.find(")", start_emp_disp)
                    empirical_dispersion = 'empiricaldispersion=(' + ','.join(
                        sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')'
                else:
                    if ' = ' in keyword_line[start_emp_disp:start_emp_disp + 4]:
                        start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find(' = ') + 3
                    elif ' =' in keyword_line[start_emp_disp:start_emp_disp + 4]:
                        start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find(' =') + 2
                    elif '=' in keyword_line[start_emp_disp:start_emp_disp + 4]:
                        start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('=') + 1
                    end_emp_disp = keyword_line.find(" ", start_emp_disp)
                    if end_emp_disp == -1:
                        empirical_dispersion = "empiricaldispersion=(" + ','.join(
                            sorted(keyword_line[start_emp_disp:].lower().split(','))) + ')'
                    else:
                        empirical_dispersion = "empiricaldispersion=(" + ','.join(
                            sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')'
            elif (temp and emp_p) or (not temp and keyword_line.strip().find('emp(') > -1):
                start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('(') + 1
                end_emp_disp = keyword_line.find(")", start_emp_disp)
                empirical_dispersion = 'empiricaldispersion=(' + ','.join(
                    sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')'
    if 'ORCA' in version_program.strip():
        keyword_line_1 = "gas phase"
        keyword_line_2 = ''
        keyword_line_3 = ''
        for i, line in enumerate(data):
            if 'CPCM SOLVATION MODEL' in line.strip():
                keyword_line_1 = "CPCM,"
            if 'SMD CDS free energy correction energy' in line.strip():
                keyword_line_2 = "SMD,"
            if "Solvent:              " in line.strip():
                keyword_line_3 = line.strip().split()[-1]
        solvation_model = keyword_line_1 + keyword_line_2 + keyword_line_3
        empirical_dispersion1 = 'No empirical dispersion detected'
        empirical_dispersion2 = ''
        empirical_dispersion3 = ''
        for i, line in enumerate(data):
            if keyword_line.strip().find('DFT DISPERSION CORRECTION') > -1:
                empirical_dispersion1 = ''
            if keyword_line.strip().find('DFTD3') > -1:
                empirical_dispersion2 = "D3"
            if keyword_line.strip().find('USING zero damping') > -1:
                empirical_dispersion3 = ' with zero damping'
        empirical_dispersion = empirical_dispersion1 + empirical_dispersion2 + empirical_dispersion3
    if 'NWChem' in version_program.strip():
        empirical_dispersion1 = 'No empirical dispersion detected'
        empirical_dispersion2 = ''
        empirical_dispersion3 = ''
        for i, line in enumerate(data):
            if keyword_line.strip().find('Dispersion correction') > -1:
                empirical_dispersion1 = ''
            if keyword_line.strip().find('disp vdw 3') > -1:
                empirical_dispersion2 = "D3"
            if keyword_line.strip().find('disp vdw 4') > -1:
                empirical_dispersion2 = "D3BJ"
        empirical_dispersion = empirical_dispersion1 + empirical_dispersion2 + empirical_dispersion3
    if 'xtb' in version_program.strip():
        solvation_model = 'gas phase'
        for line in data:
            stripped = line.strip()
            if 'program call' in stripped and ':' in stripped:
                solvation_model = _xtb_solvation_from_call(stripped.split(':', 1)[1].strip())
                break
        empirical_dispersion = 'No empirical dispersion detected'

    return spe, program, version_program, solvation_model, file, charge, empirical_dispersion, multiplicity



[docs]
def sp_cpu(file):
    """Read single-point output for CPU time.

    Delegates to ``parse_qcdata(file).cpu`` so the SPC CPU is identical
    to what a standalone parse of the same file would report. This
    guarantees that ``goodvibes <parents> --spc <suffix>`` totals
    correctly reflect the SPC files' CPU — including program-specific
    accumulation (Gaussian sums multi-step ``Job cpu time`` lines) and
    nprocs scaling (ORCA reports wall time only; the parser scales by
    parallel-MPI process count to estimate effective CPU).
    """
    candidates = [
        os.path.splitext(file)[0] + '.log',
        os.path.splitext(file)[0] + '.out',
    ]
    for cand in candidates:
        if os.path.exists(cand):
            return parse_qcdata(cand).cpu
    raise ValueError("File {} does not exist".format(file))




# Tokens on the ORCA ``!`` keyword line that are NOT method or basis set.
_ORCA_SKIP_TOKENS = {
    # Job types
    'OPT', 'OPTTS', 'FREQ', 'NUMFREQ', 'NUMHESS', 'ANFREQ', 'SP', 'NMR',
    'VPT2', 'SCANTS', 'NEB-TS', 'FAST-NEB-TS', 'NEB-CI',
    # Solvation
    'CPCM', 'SMD',
    # Dispersion (standalone keywords, not embedded in functional name)
    'D3BJ', 'D3', 'D4', 'D3ZERO',
    # SCF convergence
    'TIGHTSCF', 'NORMALSCF', 'LOOSESCF', 'VERYTIGHTSCF', 'EXTREMESCF',
    'SLOWCONV', 'NOCONV',
    # RI approximations
    'RIJCOSX', 'RI', 'RIJK', 'RIJONX', 'AUTOAUX', 'RIJDX',
    # Grid
    'GRID4', 'GRID5', 'GRID6', 'GRID7',
    'FINALGRID4', 'FINALGRID5', 'FINALGRID6', 'FINALGRID7',
    # Output verbosity
    'PRINT', 'MINIPRINT', 'LARGEPRINT', 'PRINTBASIS', 'PRINTMOS', 'NOPRINT',
    # Reference (DFT)
    'UKS', 'RKS',
    # Other
    'NOFROZENCORE', 'FROZENCORE', 'MOREAD', 'XYZFILE',
    'SCALFREQ', 'QUASIRRHO',
}

# Reference keywords that map to HF when no other method is present
_ORCA_HF_REFS = {'UHF', 'RHF', 'ROHF'}

# Regex for recognizing basis set tokens (case-insensitive matching)
_BASIS_PATTERNS = [
    re.compile(r'^\d+-\d+\+*G', re.IGNORECASE),          # Pople: 6-31G, 6-311+G, etc.
    re.compile(r'^(AUG-)?CC-PV[DTQR56]Z$', re.IGNORECASE),  # Dunning
    re.compile(r'^DEF2-', re.IGNORECASE),                  # Karlsruhe def2-
    re.compile(r'^(SV|TZV|QZV)P?P?$', re.IGNORECASE),     # Ahlrichs without def2
    re.compile(r'^SV\(P\)$', re.IGNORECASE),               # SV(P)
]


def _is_basis_set(token):
    """Return True if *token* looks like a basis set name."""
    return any(pat.match(token) for pat in _BASIS_PATTERNS)


def _is_auxiliary_basis(token):
    """Return True if *token* is an auxiliary basis set (e.g. cc-pVTZ/C, def2/J)."""
    upper = token.upper()
    if '/' in upper:
        suffix = upper.rsplit('/', 1)[1]
        if suffix in ('C', 'J', 'JK', 'JKFIT', 'CFIT'):
            return True
        # def2/J, def2/JK style
        if upper.startswith('DEF2/'):
            return True
    return False


def _parse_orca_lot(data):
    """Extract (method, basis_set) from ORCA output file lines.

    Parses the ``!`` keyword line(s) in the ORCA input section and classifies
    tokens into method, basis set, or known keywords to skip.  Falls back to
    the ``Your calculation utilizes the basis:`` line when no basis set is
    found on the ``!`` line.
    """
    kw_tokens = []
    fallback_basis = None

    for line in data:
        stripped = line.strip()

        # Collect tokens from ORCA input keyword lines: |  N> ! ...
        # The ``!`` must be the first non-whitespace after ``>`` to avoid
        # matching ``!`` inside comments on other input lines.
        if '|' in line and '>' in stripped:
            after_angle = stripped.split('>', 1)[1].lstrip()
            if after_angle.startswith('!'):
                after_bang = after_angle.split('!', 1)[1]
                kw_tokens.extend(after_bang.split())

        # Fallback basis from ORCA output section
        if 'Your calculation utilizes the basis:' in stripped and fallback_basis is None:
            fallback_basis = stripped.split(':')[-1].strip()

    # Classify tokens
    method = None
    basis = None
    hf_ref = None  # Track UHF/RHF/ROHF in case it's the only method

    for token in kw_tokens:
        upper = token.upper()

        # Skip parallelization (pal4, pal8, pal16, ...)
        if re.match(r'^PAL\d+$', upper):
            continue

        # Skip known ORCA keywords
        if upper in _ORCA_SKIP_TOKENS:
            continue

        # Skip auxiliary basis sets
        if _is_auxiliary_basis(token):
            continue

        # Skip GCP(...) tokens
        if upper.startswith('GCP(') or upper == 'GCP':
            continue

        # Skip QM/QM2 compound job keyword
        if upper == 'QM/QM2':
            continue

        # Track HF reference keywords
        if upper in _ORCA_HF_REFS:
            hf_ref = 'HF'
            continue

        # Classify as basis set or method
        if _is_basis_set(token) and basis is None:
            basis = token
        elif method is None:
            method = token

    # Fall back: if only HF reference found and no other method
    if method is None and hf_ref is not None:
        method = hf_ref

    # Fall back to "utilizes the basis" line
    if basis is None and fallback_basis is not None:
        basis = fallback_basis

    return method or 'none', basis or 'none'



[docs]
def level_of_theory(file):
    """Read output for the level of theory and basis set used."""
    repeated_theory = 0
    with open(file, encoding='utf-8', errors='replace') as f:
        data = f.readlines()
    level, bs = 'none', 'none'

    # Detect ORCA and use dedicated parser
    for line in data:
        if '* O   R   C   A *' in line:
            level, bs = _parse_orca_lot(data)
            return '/'.join([level, bs])
        if 'Gaussian' in line:
            break
        if 'NWChem' in line:
            break
        if 'x T B' in line or 'xtb version' in line:
            for ln in data:
                if 'Hamiltonian' in ln and 'xTB' in ln:
                    parts = ln.split()
                    for tok in parts:
                        if 'xTB' in tok:
                            return tok + '/none'
            return 'GFN2-xTB/none'

    for line in data:
        if line.strip().find('External calculation') > -1:
            level, bs = 'ext', 'ext'
            break
        if '\\Freq\\' in line.strip() and repeated_theory == 0:
            try:
                level, bs = (line.strip().split("\\")[4:6])
                repeated_theory = 1
            except IndexError:
                pass
        elif '|Freq|' in line.strip() and repeated_theory == 0:
            try:
                level, bs = (line.strip().split("|")[4:6])
                repeated_theory = 1
            except IndexError:
                pass
        if '\\SP\\' in line.strip() and repeated_theory == 0:
            try:
                level, bs = (line.strip().split("\\")[4:6])
                repeated_theory = 1
            except IndexError:
                pass
        elif '|SP|' in line.strip() and repeated_theory == 0:
            try:
                level, bs = (line.strip().split("|")[4:6])
                repeated_theory = 1
            except IndexError:
                pass
        if 'DLPNO BASED TRIPLES CORRECTION' in line.strip():
            level = 'DLPNO-CCSD(T)'
        if 'Estimated CBS total energy' in line.strip():
            try:
                bs = ("Extrapol." + line.strip().split()[4])
            except IndexError:
                pass
        # Remove the restricted R or unrestricted U label
        if level[0] in ('R', 'U'):
            level = level[1:]
    level_of_theory = '/'.join([level, bs])
    return level_of_theory



[docs]
def read_initial(file):
    """At beginning of procedure, read level of theory, solvation model, and check for normal termination"""
    with open(file, encoding='utf-8', errors='replace') as f:
        data = f.readlines()
    level, bs, program, keyword_line = 'none', 'none', 'none', 'none'
    solvation_model = "gas phase"
    progress, orientation = 'Incomplete', 'Input'
    a, repeated_theory = 0, 0
    no_grid = True
    dft_used = 'F'
    grid_lookup = {1: 'sg1', 2: 'coarse', 4: 'fine', 5: 'ultrafine', 7: 'superfine'}

    for line in data:
        # Determine program
        if "Gaussian" in line:
            program = "Gaussian"
            break
        if "* O   R   C   A *" in line:
            program = "Orca"
            break
        if "NWChem" in line:
            program = "NWChem"
            break
        if "x T B" in line or "xtb version" in line:
            program = "xtb"
            break
        if "You are running Q-Chem" in line or "Welcome to Q-Chem" in line:
            program = "QChem"
            break
    # ASE extxyz: program=ase appears in line 2 (the comment line)
    if program == 'none' and len(data) >= 2 and 'program=ase' in data[1]:
        program = 'ase'
    for line in data:
        # Grab pertinent information from file
        if line.strip().find('External calculation') > -1:
            level, bs = 'ext', 'ext'
        if line.strip().find('Standard orientation:') > -1:
            orientation = 'Standard'
        if line.strip().find('IExCor=') > -1 and no_grid:
            try:
                dft_used = line.split('=')[2].split()[0]
                _ = grid_lookup[int(dft_used)]
                no_grid = False
            except (KeyError, ValueError, IndexError):
                pass
        if '\\Freq\\' in line.strip() and repeated_theory == 0:
            try:
                level, bs = (line.strip().split("\\")[4:6])
                repeated_theory = 1
            except IndexError:
                pass
        elif '|Freq|' in line.strip() and repeated_theory == 0:
            try:
                level, bs = (line.strip().split("|")[4:6])
                repeated_theory = 1
            except IndexError:
                pass
        if '\\SP\\' in line.strip() and repeated_theory == 0:
            try:
                level, bs = (line.strip().split("\\")[4:6])
                repeated_theory = 1
            except IndexError:
                pass
        elif '|SP|' in line.strip() and repeated_theory == 0:
            try:
                level, bs = (line.strip().split("|")[4:6])
                repeated_theory = 1
            except IndexError:
                pass
        if 'DLPNO BASED TRIPLES CORRECTION' in line.strip():
            level = 'DLPNO-CCSD(T)'
        if 'Estimated CBS total energy' in line.strip():
            try:
                bs = ("Extrapol." + line.strip().split()[4])
            except IndexError:
                pass
        # Remove the restricted R or unrestricted U label
        if level[0] in ('R', 'U'):
            level = level[1:]

    #NWChem specific parsing
    if program == 'NWChem':
        keyword_line_1 = "gas phase"
        keyword_line_2 = ''
        keyword_line_3 = ''
        for i, line in enumerate(data):
            if line.strip().startswith("xc "):
                level=line.strip().split()[1]
            if line.strip().startswith("* library "):
                bs = line.strip().replace("* library ",'')
            #need to update these tags for NWChem solvation later
            if 'CPCM SOLVATION MODEL' in line.strip():
                keyword_line_1 = "CPCM,"
            if 'SMD CDS free energy correction energy' in line.strip():
                keyword_line_2 = "SMD,"
            if "Solvent:              " in line.strip():
                keyword_line_3 = line.strip().split()[-1]
            #need to update NWChem keyword for error calculation
            if 'Total times' in line:
                progress = 'Normal'
            elif 'error termination' in line:
                progress = 'Error'
        solvation_model = keyword_line_1 + keyword_line_2 + keyword_line_3

    # Grab solvation models - Gaussian files
    if program == 'Gaussian':
        for i, line in enumerate(data):
            if '#' in line.strip() and a == 0:
                for j, line in enumerate(data[i:i + 10]):
                    if '--' in line.strip():
                        a = a + 1
                        break
                    if a != 0:
                        break
                    else:
                        for k in range(len(line.strip().split("\n"))):
                            keyword_line += line.strip().split("\n")[k]
            if 'Normal termination' in line:
                progress = 'Normal'
            elif 'Error termination' in line:
                progress = 'Error'
        keyword_line = keyword_line.lower()
        if 'scrf' not in keyword_line.strip():
            solvation_model = "gas phase"
        else:
            start_scrf = keyword_line.strip().find('scrf') + 5
            if start_scrf >= len(keyword_line):
                # Bare `scrf` with no body — Gaussian defaults to PCM/water.
                solvation_model = "scrf"
            elif keyword_line[start_scrf] == "(":
                end_scrf = keyword_line.find(")", start_scrf)
                solvation_model = "scrf=" + keyword_line[start_scrf:end_scrf]
                if solvation_model[-1] != ")":
                    solvation_model = solvation_model + ")"
            else:
                start_scrf2 = keyword_line.strip().find('scrf') + 4
                if keyword_line.find(" ", start_scrf) > -1:
                    end_scrf = keyword_line.find(" ", start_scrf)
                else:
                    end_scrf = len(keyword_line)
                if keyword_line[start_scrf2] == "(":
                    solvation_model = "scrf=(" + keyword_line[start_scrf:end_scrf]
                    if solvation_model[-1] != ")":
                        solvation_model = solvation_model + ")"
                else:
                    if keyword_line.find(" ", start_scrf) > -1:
                        end_scrf = keyword_line.find(" ", start_scrf)
                    else:
                        end_scrf = len(keyword_line)
                    solvation_model = "scrf=" + keyword_line[start_scrf:end_scrf]
    # xtb parsing for level of theory and solvation model
    elif program == 'xtb':
        # Hamiltonian (GFN0/1/2-xTB) → "level"; xtb has no basis set in the QC sense
        for ln in data:
            if 'Hamiltonian' in ln and 'xTB' in ln:
                for tok in ln.split():
                    if 'xTB' in tok:
                        level = tok
                        break
                if level != 'none':
                    break
        bs = 'none'
        # Solvation from program-call line
        solvation_model = 'gas phase'
        for ln in data:
            stripped = ln.strip()
            if 'program call' in stripped and ':' in stripped:
                solvation_model = _xtb_solvation_from_call(stripped.split(':', 1)[1].strip())
                break
        # Termination status
        for ln in data:
            if '[ERROR]' in ln or 'fatal error' in ln.lower():
                progress = 'Error'
                break
            if 'finished run on' in ln:
                progress = 'Normal'

    # Q-Chem: parse $rem METHOD/BASIS from the input echo, solvation from
    # the run banner, and termination from "Thank you very much" footer.
    elif program == 'QChem':
        in_rem = False
        method, basis = 'none', 'none'
        cpcm = smd = solvent_name = ''
        for line in data:
            s = line.strip()
            if s.lower() == '$rem':
                in_rem = True
                continue
            if in_rem:
                if s == '$end' or s.startswith('$'):
                    in_rem = False
                else:
                    parts = s.split()
                    if len(parts) >= 2:
                        key = parts[0].lower()
                        if key == 'method' and method == 'none':
                            method = parts[1]
                        elif key == 'basis' and basis == 'none':
                            basis = parts[1]
            if 'Using C-PCM' in s or 'C-PCM dielectric factor' in s:
                cpcm = 'CPCM'
            elif 'Using the SMD solvation model' in s:
                smd = 'SMD'
            elif s.startswith('Solvent:'):
                try:
                    solvent_name = s.split(':', 1)[1].strip()
                except IndexError:
                    pass
            if 'Thank you very much for using Q-Chem' in line:
                progress = 'Normal'
            elif 'Q-Chem fatal error' in line or 'fatal error occurred' in line:
                progress = 'Error'
        level, bs = method, basis
        if smd:
            solvation_model = 'SMD,' + solvent_name if solvent_name else 'SMD'
        elif cpcm:
            solvation_model = 'CPCM,' + solvent_name if solvent_name else 'CPCM'

    # ASE extxyz: pull metadata directly from the comment-line key=values
    elif program == 'ase':
        info = _parse_extxyz_comment(data[1]) if len(data) >= 2 else {}
        lot = info.get('level_of_theory', '')
        if '/' in lot:
            level, bs = lot.split('/', 1)
        elif lot:
            level, bs = lot, 'none'
        else:
            level, bs = 'none', 'none'
        solvation_model = info.get('solvation_model', 'gas phase') or 'gas phase'
        try:
            float(info.get('scf_energy', ''))
            progress = 'Normal'
        except ValueError:
            progress = 'Incomplete'

    # ORCA parsing for solvation model and level of theory
    elif program == 'Orca':
        level, bs = _parse_orca_lot(data)
        keyword_line_1 = "gas phase"
        keyword_line_2 = ''
        keyword_line_3 = ''
        orca_abort_line = -1
        last_freq_line = -1
        for i, line in enumerate(data):
            if 'CPCM SOLVATION MODEL' in line.strip():
                keyword_line_1 = "CPCM,"
            if 'SMD CDS free energy correction energy' in line.strip():
                keyword_line_2 = "SMD,"
            if "Solvent:              " in line.strip():
                keyword_line_3 = line.strip().split()[-1]
            if 'ORCA TERMINATED NORMALLY' in line:
                progress = 'Normal'
            elif 'error termination' in line:
                progress = 'Error'
            if 'ORCA will abort' in line:
                orca_abort_line = i
            if 'VIBRATIONAL FREQUENCIES' in line and '---' not in line:
                last_freq_line = i
        # Abort after the last freq section means no valid thermo was produced
        if orca_abort_line > last_freq_line:
            progress = 'Error'
        solvation_model = keyword_line_1 + keyword_line_2 + keyword_line_3
    level_of_theory = '/'.join([level, bs])

    return level_of_theory, solvation_model, progress, orientation, dft_used



[docs]
def gaussian_jobtype(filename):
    """Read the jobtype from a Gaussian archive string."""
    job = ''
    with open(filename, encoding='utf-8', errors='replace') as f:
        for line in f:
            if line.strip().find('\\SP\\') > -1:
                job += 'SP'
            if line.strip().find('\\FOpt\\') > -1:
                job += 'GS'
            if line.strip().find('\\FTS\\') > -1:
                job += 'TS'
            if line.strip().find('\\Freq\\') > -1:
                job += 'Freq'
    return job




[docs]
def parse_gaussian_thermo(file):
    """Parse Gaussian output for all thermochemistry-relevant data.

    Returns QCData with raw frequencies (negative = imaginary, no inversion
    applied). Frequency inversion is a user policy decision handled in
    thermo.py.

    Parameters
    ----------
    file : str
        Path to Gaussian output file.
    """
    qcdata = QCData(file=file, program='Gaussian')

    # Delegate solvation, dispersion, version, charge to parse_data
    (_, _, version_program, solvation_model, _, charge,
     empirical_dispersion, multiplicity) = parse_data(file)
    qcdata.version_program = version_program
    qcdata.solvation_model = solvation_model
    qcdata.empirical_dispersion = empirical_dispersion
    if charge is not None:
        qcdata.charge = charge
    if multiplicity is not None:
        qcdata.multiplicity = multiplicity

    # Job type from archive string
    qcdata.job_type = gaussian_jobtype(file)

    # Read file
    with open(file, encoding='utf-8', errors='replace') as f:
        g_output = f.readlines()

    # Auto-detect G4 composite method
    g4 = any('G4(0 K)' in line for line in g_output)

    # Detect ONIOM
    is_oniom = any('ONIOM: extrapolated energy' in line for line in g_output)
    qcdata.has_oniom = is_oniom

    # --- First pass: find link structure ---
    linkmax = 0
    freqloc = 0
    for line in g_output:
        if 'Normal termination' in line:
            linkmax += 1
        if 'Frequencies --' in line:
            freqloc = linkmax

    # --- Second pass: extract data ---
    link = 0
    frequency_wn = []
    im_frequency_wn = []
    fract_modelsys = []
    per_atom_masses = []  # Gaussian's per-atom masses (respects iso= keyword)
    freq_started = False  # True once we encounter the first "Frequencies --" in this link
    freq_done = False     # True once VPT2 "Recovering" marker is seen (guards against duplicates)

    if freqloc == 0:
        freqloc = len(g_output)

    for i, line in enumerate(g_output):
        # Link counter
        if 'Normal termination' in line:
            link += 1
            if link == freqloc:
                frequency_wn = []
                im_frequency_wn = []
                fract_modelsys = []
                freq_started = False
                freq_done = False

        # Stop after freq link unless G4/composite
        if not g4 and link > freqloc:
            break

        # VPT2/anharmonic: "Recovering previously computed normal modes" means
        # the frequencies about to appear are a duplicate of the harmonic set
        if 'Recovering previously computed normal modes' in line:
            freq_done = True

        # Frequencies
        if not freq_done and line.strip().startswith('Frequencies -- '):
            freq_started = True
            if is_oniom:
                fract_line = g_output[i + 3]
            for j in range(2, 5):
                try:
                    x = float(line.strip().split()[j])
                    if x > 0.0:
                        frequency_wn.append(x)
                        if is_oniom:
                            try:
                                y = float(fract_line.strip().split()[j]) / 100.0
                                y = float('{:.6f}'.format(y))
                                fract_modelsys.append(y)
                            except (IndexError, ValueError):
                                fract_modelsys.append(1.0)
                    elif x < 0.0:
                        im_frequency_wn.append(x)
                except IndexError:
                    pass

        # --- SCF energy (all variants, last one wins) ---
        # Guard against VPT2 displaced geometry energies: once frequencies are
        # read, ignore further SCF Done / E2 / EUMP2 lines (but still allow
        # archive-line energies like CCSD= which appear after frequencies).
        elif not freq_started and line.strip().startswith('SCF Done:'):
            qcdata.scf_energy = float(line.strip().split()[4])
        elif not freq_started and line.strip().startswith('E2('):
            spe_value = line.strip().split()[-1]
            qcdata.scf_energy = float(spe_value.replace('D', 'E'))
        elif line.strip().startswith('Counterpoise corrected energy'):
            qcdata.scf_energy = float(line.strip().split()[4])
        elif not freq_started and 'EUMP2 =' in line.strip():
            qcdata.scf_energy = float((line.strip().split()[5]).replace('D', 'E'))
        elif 'CCSD(T)=' in line.strip():
            raw = line.strip().split('CCSD(T)=')[1].split('\\')[0].split()[0]
            qcdata.scf_energy = float(raw.replace('D', 'E'))
        elif 'CCSD=' in line.strip() and 'CCSD(T)' not in line.strip():
            raw = line.strip().split('CCSD=')[1].split('\\')[0].split()[0]
            qcdata.scf_energy = float(raw.replace('D', 'E'))
        elif 'ONIOM: extrapolated energy' in line.strip():
            qcdata.scf_energy = float(line.strip().split()[4])
        elif line.strip().startswith('G4(0 K)'):
            qcdata.scf_energy = float(line.strip().split()[2])
            if qcdata.zero_point_corr is not None:
                qcdata.scf_energy -= qcdata.zero_point_corr
        elif line.strip().startswith('E(ZPE)='):
            qcdata.zero_point_corr = float(line.strip().split()[1])
        elif 'E(TD-HF/TD-DFT)' in line.strip():
            qcdata.scf_energy = float(line.strip().split()[4])
        elif ('Energy= ' in line.strip()
              and 'Predicted' not in line.strip()
              and 'Thermal' not in line.strip()
              and 'G4' not in line.strip()):
            qcdata.scf_energy = float(line.strip().split()[1])

        # Coordinates: take the LAST "Standard orientation" or "Input orientation" block
        elif 'Standard orientation:' in line or 'Input orientation:' in line:
            atom_nums_tmp = []
            cartesians_tmp = []
            for k in range(i + 5, len(g_output)):
                if '-----' in g_output[k]:
                    break
                parts = g_output[k].split()
                atom_nums_tmp.append(int(parts[1]))
                cartesians_tmp.append([float(parts[3]), float(parts[4]), float(parts[5])])
            qcdata.atom_nums = atom_nums_tmp
            qcdata.cartesians = cartesians_tmp

        # Zero-point correction
        elif line.strip().startswith('Zero-point correction='):
            qcdata.zero_point_corr = float(line.strip().split()[2])

        # Multiplicity
        elif 'Multiplicity' in line.strip():
            try:
                qcdata.multiplicity = int(line.split('=')[-1].strip().split()[0])
            except (ValueError, IndexError):
                qcdata.multiplicity = int(line.split()[-1])

        # Per-atom isotope masses Gaussian printed for this freq job. Captured
        # so we can recompute high-precision rotational constants from the
        # geometry — the "Rotational temperatures" block prints only 3 sig figs,
        # which loses ~3 µHartree in G for heavy or floppy systems.
        elif (line.strip().startswith('Atom')
              and 'has atomic number' in line
              and 'and mass' in line):
            try:
                per_atom_masses.append(float(line.split('and mass')[1].split()[0]))
            except (ValueError, IndexError):
                pass

        # Molecular mass — also marks the end of one per-atom mass block, so
        # subsequent thermo sections (rare; G4 composites) start fresh.
        elif line.strip().startswith('Molecular mass:'):
            qcdata.molecular_mass = float(line.strip().split()[2])

        # Rotational symmetry number
        elif line.strip().startswith('Rotational symmetry number'):
            qcdata.symmno = int((line.strip().split()[3]).split('.')[0])

        # Point group / linearity
        elif line.strip().startswith('Full point group'):
            pg = line.strip().split()[3]
            qcdata.point_group = pg
            if pg in ('D*H', 'C*V'):
                qcdata.linear_mol = True

        # Rotational constants (GHz)
        elif line.strip().startswith('Rotational constants (GHZ):'):
            try:
                parts = line.strip().replace(':', ' ').split()
                qcdata.roconst = [float(parts[3]), float(parts[4]), float(parts[5])]
            except ValueError:
                if '********' in line.strip():
                    qcdata.linear_warning = True
                    parts = line.strip().replace(':', ' ').split()
                    qcdata.roconst = [float(parts[4]), float(parts[5])]

        # Rotational temperatures
        elif line.strip().startswith('Rotational temperature '):
            qcdata.rotemp = [float(line.strip().split()[3])]
        elif line.strip().startswith('Rotational temperatures'):
            try:
                qcdata.rotemp = [float(line.strip().split()[3]),
                                 float(line.strip().split()[4]),
                                 float(line.strip().split()[5])]
            except ValueError:
                if '********' in line.strip():
                    qcdata.linear_warning = True
                    qcdata.rotemp = [float(line.strip().split()[4]),
                                     float(line.strip().split()[5])]

        # CPU time (not elif — checked independently for every line)
        if 'Job cpu time' in line.strip():
            qcdata.cpu = [
                int(line.split()[3]) + qcdata.cpu[0],
                int(line.split()[5]) + qcdata.cpu[1],
                int(line.split()[7]) + qcdata.cpu[2],
                0 + qcdata.cpu[3],
                int(float(line.split()[9]) * 1000.0) + qcdata.cpu[4],
            ]

    qcdata.frequency_wn = frequency_wn
    qcdata.im_frequency_wn = im_frequency_wn
    if is_oniom:
        qcdata.fract_modelsys = fract_modelsys
    if qcdata.atom_nums:
        qcdata.atom_types = [periodictable[n] for n in qcdata.atom_nums]

    # High-precision rotational constants: Gaussian's "Rotational temperatures"
    # printout is only 3 sig figs, which propagates ~3 µHartree of error into G
    # via S_rot for systems with small θ_rot. Recompute MOI from the geometry
    # using the per-atom masses Gaussian itself reported (this respects the
    # iso= keyword), then derive roconst / rotemp at full double precision.
    # G4 composites can print masses for several thermo sections; the final
    # len(cartesians) entries always belong to the freq job we care about.
    if (qcdata.cartesians
            and len(per_atom_masses) >= len(qcdata.cartesians)
            and not qcdata.linear_warning
            and len(qcdata.rotemp) == 3):
        masses = np.asarray(per_atom_masses[-len(qcdata.cartesians):], float)
        coords = np.asarray(qcdata.cartesians, float)
        total_mass = float(masses.sum())
        com = (masses[:, None] * coords).sum(axis=0) / total_mass
        r = coords - com
        Ixx = float((masses * (r[:, 1] ** 2 + r[:, 2] ** 2)).sum())
        Iyy = float((masses * (r[:, 0] ** 2 + r[:, 2] ** 2)).sum())
        Izz = float((masses * (r[:, 0] ** 2 + r[:, 1] ** 2)).sum())
        Ixy = float(-(masses * r[:, 0] * r[:, 1]).sum())
        Ixz = float(-(masses * r[:, 0] * r[:, 2]).sum())
        Iyz = float(-(masses * r[:, 1] * r[:, 2]).sum())
        inertia = np.array([[Ixx, Ixy, Ixz], [Ixy, Iyy, Iyz], [Ixz, Iyz, Izz]])
        eigvals = np.sort(np.maximum(np.linalg.eigvalsh(inertia), 0.0))
        if all(I > 1e-3 for I in eigvals):
            HC_OVER_KB = 1.4387768775  # K per cm⁻¹
            roconst_cm = [16.857629 / I for I in eigvals]
            qcdata.roconst = [b * 29.9792458 for b in roconst_cm]
            qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm]
            qcdata.molecular_mass = total_mass

    _fill_mass_and_rotemp_from_geometry(qcdata)

    return qcdata




[docs]
def parse_nwchem_thermo(file):
    """Parse NWChem output for all thermochemistry-relevant data.

    Returns QCData with raw frequencies (negative = imaginary, no inversion
    applied).

    Parameters
    ----------
    file : str
        Path to NWChem output file.
    """
    qcdata = QCData(file=file, program='NWChem')

    # Delegate version, charge, multiplicity to parse_data
    try:
        (_, _, version_program, solvation_model, _, charge,
         empirical_dispersion, multiplicity) = parse_data(file)
        qcdata.version_program = version_program
        qcdata.solvation_model = solvation_model
        qcdata.empirical_dispersion = empirical_dispersion
        if charge is not None:
            qcdata.charge = charge
        if multiplicity is not None:
            qcdata.multiplicity = multiplicity
    except (ValueError, IndexError):
        pass

    # NWChem has no archive string for job type; detect from content
    with open(file, encoding='utf-8', errors='replace') as f:
        g_output = f.readlines()

    frequency_wn = []
    im_frequency_wn = []

    for i, line in enumerate(g_output):
        # Frequencies (up to 6 per line)
        if line.strip().startswith('P.Frequency'):
            for j in range(1, 7):
                try:
                    x = float(line.strip().split()[j])
                    if x > 0.0:
                        frequency_wn.append(x)
                    elif x < 0.0:
                        im_frequency_wn.append(x)
                except IndexError:
                    pass

        # SCF energy
        elif line.strip().startswith('Total DFT energy ='):
            qcdata.scf_energy = float(line.strip().split()[4])

        # Zero-point correction
        elif line.strip().startswith('Zero-Point'):
            qcdata.zero_point_corr = float(line.strip().split()[8])

        # Multiplicity — match input-deck echo "mult <N>" or runtime
        # "Spin multiplicity: <N>" (NWChem 7.x).
        elif 'mult ' in line.strip():
            try:
                qcdata.multiplicity = int(line.split()[1])
            except (ValueError, IndexError):
                qcdata.multiplicity = 1
        elif 'Spin multiplicity:' in line:
            try:
                qcdata.multiplicity = int(line.split(":", 1)[1].strip().split()[0])
            except (ValueError, IndexError):
                pass

        # Coordinates: take the LAST "Output coordinates in angstroms" block
        elif 'Output coordinates in angstroms' in line:
            atom_nums_tmp = []
            cartesians_tmp = []
            for k in range(i + 4, len(g_output)):
                cline = g_output[k].strip()
                if cline == '':
                    break
                parts = cline.split()
                atom_nums_tmp.append(int(float(parts[2])))
                cartesians_tmp.append([float(parts[3]), float(parts[4]), float(parts[5])])
            qcdata.atom_nums = atom_nums_tmp
            qcdata.cartesians = cartesians_tmp

        # Molecular mass
        elif line.strip().find('mol. weight') != -1:
            qcdata.molecular_mass = float(line.strip().split()[-1][0:-1])

        # Rotational symmetry number
        elif line.strip().find('symmetry #') != -1:
            qcdata.symmno = int(line.strip().split()[-1][0:-1])

        # Point group / linearity
        elif line.strip().find('symmetry detected') != -1:
            pg = line.strip().split()[0]
            qcdata.point_group = pg
            if pg in ('D*H', 'C*V'):
                qcdata.linear_mol = True

        # Version (fallback if parse_data failed)
        elif 'nwchem branch' in line.strip() and not qcdata.version_program:
            qcdata.version_program = 'NWChem version ' + line.split()[3]

        # Rotational constants (convert cm⁻¹ to GHz) and temperatures
        elif line.strip().startswith('A=') or line.strip().startswith('B=') or line.strip().startswith('C='):
            letter = line.strip()[0]
            h = {'A': 0, 'B': 1, 'C': 2}[letter]
            qcdata.roconst[h] = float(line.strip().split()[1]) * 29.9792458
            qcdata.rotemp[h] = float(line.strip().split()[4])

        # CPU time
        if 'Total times' in line.strip():
            secs = float(line.strip().split()[3][0:-1])
            qcdata.cpu = [0, 0, 0, secs, 0.0]

    # Determine job type from content
    if len(frequency_wn) > 0 or len(im_frequency_wn) > 0:
        qcdata.job_type = 'Freq'
    else:
        qcdata.job_type = 'SP'

    qcdata.frequency_wn = frequency_wn
    qcdata.im_frequency_wn = im_frequency_wn
    if qcdata.atom_nums:
        qcdata.atom_types = [periodictable[n] for n in qcdata.atom_nums]

    _fill_mass_and_rotemp_from_geometry(qcdata)

    return qcdata



def _scale_cpu(cpu, factor):
    """Multiply a [days, hours, mins, secs, msecs] tuple by an integer factor.

    Used by the ORCA parser to convert wall time × nprocs → effective CPU.
    """
    total_ms = (
        cpu[0] * 86_400_000
        + cpu[1] * 3_600_000
        + cpu[2] * 60_000
        + cpu[3] * 1_000
        + cpu[4]
    ) * factor
    days, total_ms = divmod(total_ms, 86_400_000)
    hours, total_ms = divmod(total_ms, 3_600_000)
    mins, total_ms = divmod(total_ms, 60_000)
    secs, msecs = divmod(total_ms, 1_000)
    return [days, hours, mins, secs, msecs]



[docs]
def parse_orca_thermo(file):
    """Parse ORCA output for all thermochemistry-relevant data.

    Uses native line-by-line parsing.

    Parameters
    ----------
    file : str
        Path to ORCA output file.
    """
    qcdata = QCData(file=file, program='Orca')

    with open(file, encoding='utf-8', errors='replace') as f:
        output = f.readlines()

    frequency_wn = []
    im_frequency_wn = []
    in_freq_section = False
    solvation_type = ''
    solvent_name = ''
    applied_freq_scale_factor = 1.0
    _has_opt = False
    _has_ts = False
    _has_freq_kw = False
    _orca_abort_line = -1
    _last_freq_line = -1

    for i, line in enumerate(output):
        stripped = line.strip()

        # --- Frequency section state machine ---
        if 'VIBRATIONAL FREQUENCIES' in stripped and '---' not in stripped:
            in_freq_section = True
            frequency_wn = []
            im_frequency_wn = []
            _last_freq_line = i
        elif in_freq_section and 'cm**-1' in stripped:
            parts = stripped.split()
            try:
                freq_val = float(parts[1])
                if freq_val > 0.0:
                    frequency_wn.append(freq_val)
                elif freq_val < 0.0:
                    im_frequency_wn.append(freq_val)
                # freq_val == 0.0 → skip (translational/rotational modes)
            except (IndexError, ValueError):
                pass
        elif in_freq_section:
            if stripped == '' and (frequency_wn or im_frequency_wn):
                in_freq_section = False

        # --- Main field parsing ---
        # Coordinates: take the LAST "CARTESIAN COORDINATES (ANGSTROEM)" block
        if 'CARTESIAN COORDINATES (ANGSTROEM)' in stripped:
            atom_nums_tmp = []
            atom_types_tmp = []
            cartesians_tmp = []
            for k in range(i + 2, len(output)):
                cline = output[k].strip()
                if cline == '' or '---' in cline:
                    break
                parts = cline.split()
                elem = parts[0]
                atom_types_tmp.append(elem)
                atom_nums_tmp.append(element_id(elem, num=True))
                cartesians_tmp.append([float(parts[1]), float(parts[2]), float(parts[3])])
            qcdata.atom_nums = atom_nums_tmp
            qcdata.atom_types = atom_types_tmp
            qcdata.cartesians = cartesians_tmp

        # SCF energy (last occurrence wins for linked jobs)
        elif stripped.startswith('FINAL SINGLE POINT ENERGY'):
            qcdata.scf_energy = float(stripped.split()[-1])

        # Collect input keywords for job type detection (handled after loop)
        elif '|' in line and '>' in stripped and '!' in stripped:
            kw = stripped.split('!', 1)[1].upper()
            if 'OPTTS' in kw:
                _has_ts = True
            elif 'OPT' in kw:
                _has_opt = True
            if 'FREQ' in kw:
                _has_freq_kw = True

        # Charge
        elif 'Total Charge' in stripped and '....' in stripped:
            qcdata.charge = int(stripped.split()[-1])

        # Multiplicity
        elif 'Multiplicity' in stripped and 'Mult' in stripped and '....' in stripped:
            qcdata.multiplicity = int(stripped.split()[-1])

        # Program version
        elif 'Program Version' in stripped:
            qcdata.version_program = 'ORCA version ' + line.split()[2]

        # Zero-point energy (Eh)
        elif stripped.startswith('Zero point energy') and '...' in stripped:
            parts = stripped.split()
            try:
                dot_idx = parts.index('...')
                qcdata.zero_point_corr = float(parts[dot_idx + 1])
            except (ValueError, IndexError):
                pass

        # Molecular mass (AMU)
        elif 'Total Mass' in stripped and '...' in stripped:
            parts = stripped.split()
            try:
                dot_idx = parts.index('...')
                qcdata.molecular_mass = float(parts[dot_idx + 1])
            except (ValueError, IndexError):
                pass

        # Rotational constants in cm⁻¹ → convert to GHz and rotational temps
        elif stripped.startswith('Rotational constants in cm-1:'):
            rparts = stripped.split(':')[1].split()
            try:
                roconst_cm = [float(rparts[0]), float(rparts[1]), float(rparts[2])]
                qcdata.roconst = [b * 29.9792458 for b in roconst_cm]
                # Rotational temperature: T_rot = h*c*B/k_B where B in cm⁻¹
                HC_OVER_KB = 1.4387768775  # K per cm⁻¹
                qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm]
            except (IndexError, ValueError):
                pass

        # Point group and symmetry number (same line)
        elif 'Point Group:' in stripped and 'Symmetry Number:' in stripped:
            pg_part = stripped.split('Point Group:')[1].split(',')[0].strip()
            qcdata.point_group = pg_part
            # Linear point groups: ORCA 6 prints "C(inf)v"/"D(inf)h",
            # ORCA 5 prints "Cinfv"/"Dinfh" — match both via the "inf" stem.
            if 'inf' in pg_part.lower():
                qcdata.linear_mol = True
            try:
                symm_part = stripped.split('Symmetry Number:')[1].strip()
                qcdata.symmno = int(symm_part)
            except (ValueError, IndexError):
                pass

        # Solvation model detection
        elif 'CPCM SOLVATION MODEL' in stripped:
            solvation_type = 'CPCM'
        elif 'SMD CDS free energy correction energy' in stripped:
            solvation_type = 'SMD'
        elif stripped.startswith('Solvent:') and '...' in stripped:
            solvent_name = stripped.split()[-1]

        # Frequency scaling factor (ORCA pre-applies this to reported frequencies)
        elif stripped.startswith('Scaling factor for frequencies'):
            try:
                applied_freq_scale_factor = float(stripped.split('=')[1].split('(')[0].strip())
            except (IndexError, ValueError):
                pass

        # CPU time. ORCA prints wall time only ("TOTAL RUN TIME:"); we scale
        # by the number of MPI processes (parsed from "Program running with
        # N parallel MPI-processes") to get an effective CPU time, matching
        # how Gaussian/NWChem/xTB report theirs.
        elif stripped.startswith('TOTAL RUN TIME:'):
            parts = stripped.split()
            try:
                cpu_list = [
                    int(parts[3]),   # days
                    int(parts[5]),   # hours
                    int(parts[7]),   # minutes
                    int(parts[9]),   # seconds
                    int(parts[11]),  # msec
                ]
                if qcdata.nprocs > 1:
                    cpu_list = _scale_cpu(cpu_list, qcdata.nprocs)
                qcdata.cpu = cpu_list
            except (IndexError, ValueError):
                pass

        # MPI process count (printed once per task; we keep the first hit).
        elif 'parallel MPI-processes' in stripped and qcdata.nprocs == 1:
            try:
                qcdata.nprocs = int(stripped.split('with')[1].split('parallel')[0].strip())
            except (IndexError, ValueError):
                pass

        # ORCA abort: optimization failed before freq calculation could run.
        if 'ORCA will abort' in stripped:
            _orca_abort_line = i

    # If the last abort comes after the last freq section (or there are no freqs),
    # any frequencies/ZPE present are from calc_hess, not the actual freq job — discard.
    if _orca_abort_line > _last_freq_line:
        frequency_wn = []
        im_frequency_wn = []
        qcdata.zero_point_corr = None

    # Assemble solvation model
    if solvation_type:
        if solvent_name:
            qcdata.solvation_model = solvation_type + ',' + solvent_name
        else:
            qcdata.solvation_model = solvation_type

    # Un-scale frequencies if ORCA applied a scaling factor, so that
    # frequency_wn always contains unscaled values (matching Gaussian convention)
    qcdata.applied_freq_scale_factor = applied_freq_scale_factor
    if applied_freq_scale_factor != 1.0 and applied_freq_scale_factor > 0.0:
        frequency_wn = [f / applied_freq_scale_factor for f in frequency_wn]
        im_frequency_wn = [f / applied_freq_scale_factor for f in im_frequency_wn]
    qcdata.frequency_wn = frequency_wn
    qcdata.im_frequency_wn = im_frequency_wn

    # Fallback: parse coordinates from "* xyz charge mult" input block
    # when CARTESIAN COORDINATES (ANGSTROEM) block is absent (e.g., thermo-only re-analysis)
    if not qcdata.atom_nums:
        for i, line in enumerate(output):
            stripped = line.strip()
            if stripped.startswith('|') and '* xyz' in stripped:
                atom_nums_tmp = []
                atom_types_tmp = []
                cartesians_tmp = []
                for k in range(i + 1, len(output)):
                    cline = output[k].strip()
                    # Input lines start with "| N>" prefix
                    if '|' in cline:
                        content = cline.split('>', 1)[-1].strip() if '>' in cline else cline.strip('| ').strip()
                    else:
                        break
                    if content == '*' or content == '' or 'END OF INPUT' in content:
                        break
                    parts = content.split()
                    if len(parts) >= 4:
                        elem = parts[0]
                        atom_types_tmp.append(elem)
                        atom_nums_tmp.append(element_id(elem, num=True))
                        cartesians_tmp.append([float(parts[1]), float(parts[2]), float(parts[3])])
                if atom_nums_tmp:
                    qcdata.atom_nums = atom_nums_tmp
                    qcdata.atom_types = atom_types_tmp
                    qcdata.cartesians = cartesians_tmp
                break

    # For linear molecules, filter rotemp to non-zero values only
    # (linear molecules have one near-zero rotational constant along the axis)
    if qcdata.linear_mol:
        nonzero_rotemp = [t for t in qcdata.rotemp if t > 1e-10]
        if nonzero_rotemp:
            qcdata.rotemp = nonzero_rotemp

    # Determine job type from combined input keywords and output content
    has_freq = _has_freq_kw or len(frequency_wn) > 0 or len(im_frequency_wn) > 0
    if _has_ts:
        qcdata.job_type = 'TSFreq' if has_freq else 'TS'
    elif _has_opt:
        qcdata.job_type = 'GSFreq' if has_freq else 'GS'
    elif has_freq:
        qcdata.job_type = 'Freq'
    else:
        qcdata.job_type = 'SP'

    _fill_mass_and_rotemp_from_geometry(qcdata)

    return qcdata



def _xtb_solvation_from_call(call_line):
    """Parse the xtb ``program call`` line for an implicit-solvation flag.

    Returns a string like 'GBSA,water', 'CPCM-X,dimethylsulfoxide',
    'ALPB,toluene', or 'gas phase'.
    """
    tokens = call_line.split()
    for i, tok in enumerate(tokens):
        if tok in ('-g', '--gbsa') and i + 1 < len(tokens):
            return 'GBSA,' + tokens[i + 1]
        if tok == '--cpcmx' and i + 1 < len(tokens):
            return 'CPCM-X,' + tokens[i + 1]
        if tok == '--alpb' and i + 1 < len(tokens):
            return 'ALPB,' + tokens[i + 1]
    return 'gas phase'


def _xtb_read_xyz_fallback(file):
    """Read the paired .xyz file when the .out has no final-geometry block.

    Returns (atom_nums, atom_types, cartesians) — empty lists if not readable.
    """
    xyz = os.path.splitext(file)[0] + '.xyz'
    if not os.path.exists(xyz):
        return [], [], []
    try:
        with open(xyz, encoding='utf-8', errors='replace') as f:
            lines = f.readlines()
    except OSError:
        return [], [], []
    if len(lines) < 2:
        return [], [], []
    try:
        natoms = int(lines[0].strip())
    except (ValueError, IndexError):
        return [], [], []
    atom_nums, atom_types, cartesians = [], [], []
    for line in lines[2:2 + natoms]:
        parts = line.split()
        if len(parts) < 4:
            continue
        elem = parts[0]
        try:
            atom_types.append(elem)
            atom_nums.append(element_id(elem, num=True))
            cartesians.append([float(parts[1]), float(parts[2]), float(parts[3])])
        except ValueError:
            continue
    return atom_nums, atom_types, cartesians



[docs]
def parse_xtb_thermo(file):
    """Parse xtb output for all thermochemistry-relevant data.

    xtb takes a ``.xyz`` coordinate file plus CLI flags rather than an input
    deck. The parser handles single-point (``xtb file.xyz``), Hessian-only
    (``--hess``), and optimization+Hessian (``--ohess``) jobs, plus implicit
    solvation (GBSA via ``-g``, ALPB via ``--alpb``, CPCM-X via ``--cpcmx``).

    For runs without ``--opt``/``--ohess`` (no optimized-geometry block in the
    output), Cartesians fall back to the paired ``.xyz`` input file.

    Parameters
    ----------
    file : str
        Path to xtb output file.
    """
    qcdata = QCData(file=file, program='xtb')

    with open(file, encoding='utf-8', errors='replace') as f:
        output = f.readlines()

    # Empty / aborted runs: bail out early but record the program.
    if not output or any('[ERROR]' in line for line in output):
        if any('xtb version' in line for line in output):
            for line in output:
                if 'xtb version' in line:
                    parts = line.split()
                    try:
                        qcdata.version_program = 'xtb version ' + parts[parts.index('version') + 1]
                    except (ValueError, IndexError):
                        pass
                    break
        return qcdata

    HC_OVER_KB = 1.4387768775  # K per cm⁻¹

    frequency_wn = []
    im_frequency_wn = []
    last_freq_idx = -1
    program_call = ''

    # First pass: locate the LAST vibrational-frequencies header.  Linear
    # molecules and atoms use the ``vibrational frequencies (cm⁻¹)`` form;
    # everything else uses ``projected vibrational frequencies (cm⁻¹)``.
    for i, line in enumerate(output):
        if 'vibrational frequencies' in line and '(cm' in line:
            last_freq_idx = i

    # Second pass: extract scalar fields and geometry; freq parsing happens once
    for i, line in enumerate(output):
        stripped = line.strip()

        # --- Version
        if 'xtb version' in stripped and not qcdata.version_program:
            parts = stripped.split()
            try:
                qcdata.version_program = 'xtb version ' + parts[parts.index('version') + 1]
            except (ValueError, IndexError):
                pass

        # --- Program call line (for solvation flag parsing)
        elif 'program call' in stripped and ':' in stripped and not program_call:
            program_call = stripped.split(':', 1)[1].strip()

        # --- Charge: ":  net charge   <n>   :"
        elif 'net charge' in stripped and stripped.startswith(':'):
            parts = stripped.split()
            try:
                qcdata.charge = int(parts[parts.index('charge') + 1])
            except (ValueError, IndexError):
                pass

        # --- Multiplicity from "unpaired electrons" (mult = unpaired + 1)
        elif 'unpaired electrons' in stripped and stripped.startswith(':'):
            parts = stripped.split()
            try:
                unpaired = int(parts[parts.index('electrons') + 1])
                qcdata.multiplicity = unpaired + 1
            except (ValueError, IndexError):
                pass

        # --- Electronic SCF energy ":: total energy   <E> Eh ::"
        # Last occurrence wins (final SCC after opt, or sole SCC for SP)
        elif stripped.startswith(':: total energy'):
            parts = stripped.split()
            try:
                qcdata.scf_energy = float(parts[3])
            except (ValueError, IndexError):
                pass

        # --- Zero-point energy
        elif stripped.startswith(':: zero point energy'):
            parts = stripped.split()
            try:
                qcdata.zero_point_corr = float(parts[4])
            except (ValueError, IndexError):
                pass

        # --- Molecular mass: "molecular mass/u    :   <m>"
        elif stripped.startswith('molecular mass/u'):
            parts = stripped.split()
            try:
                qcdata.molecular_mass = float(parts[-1])
            except (ValueError, IndexError):
                pass

        # --- Rotational constants in cm⁻¹: "rotational constants/cm⁻¹ : B1 B2 B3"
        # xtb prints "Infinity" for atoms (no rotation); skip in that case.
        elif stripped.startswith('rotational constants/cm'):
            parts = stripped.split(':', 1)[1].split() if ':' in stripped else stripped.split()
            if 'Infinity' in parts[:3]:
                pass  # atom: leave roconst/rotemp at defaults
            else:
                try:
                    roconst_cm = [float(parts[0]), float(parts[1]), float(parts[2])]
                    qcdata.roconst = [b * 29.9792458 for b in roconst_cm]
                    qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm]
                except (IndexError, ValueError):
                    pass

        # --- Linear marker (thermo SETUP block)
        elif 'linear?' in stripped and stripped.startswith(':'):
            if 'true' in stripped:
                qcdata.linear_mol = True

        # --- Point group from "symmetry  <pg>"
        elif stripped.startswith(':') and 'symmetry' in stripped and 'rotational' not in stripped:
            parts = stripped.split()
            try:
                idx = parts.index('symmetry')
                if idx + 1 < len(parts) and parts[idx + 1] != ':':
                    pg = parts[idx + 1]
                    if pg not in ('GBSA', 'found', 'elements:'):
                        qcdata.point_group = pg
            except ValueError:
                pass

        # --- Symmetry number ":  rotational number  <n>  :"
        elif 'rotational number' in stripped and stripped.startswith(':'):
            parts = stripped.split()
            try:
                qcdata.symmno = int(parts[parts.index('number') + 1])
            except (ValueError, IndexError):
                pass

        # --- Final optimized geometry block (only present for --opt/--ohess)
        elif stripped == 'final structure:':
            # Skip the "===" line; line i+1 is "===", i+2 is atom count, i+3 is comment
            try:
                natoms = int(output[i + 2].strip())
            except (ValueError, IndexError):
                continue
            atom_nums_tmp = []
            atom_types_tmp = []
            cartesians_tmp = []
            for k in range(i + 4, i + 4 + natoms):
                if k >= len(output):
                    break
                parts = output[k].split()
                if len(parts) < 4:
                    break
                try:
                    atom_types_tmp.append(parts[0])
                    atom_nums_tmp.append(element_id(parts[0], num=True))
                    cartesians_tmp.append([float(parts[1]), float(parts[2]), float(parts[3])])
                except ValueError:
                    break
            qcdata.atom_nums = atom_nums_tmp
            qcdata.atom_types = atom_types_tmp
            qcdata.cartesians = cartesians_tmp

        # --- CPU time from "total:" block: "* cpu-time: <D> d, <H> h, <M> min, <S> sec"
        # xtb prints two spaces ("*  cpu-time:") so match the substring.
        elif 'cpu-time:' in stripped and stripped.startswith('*') and qcdata.cpu == [0, 0, 0, 0, 0]:
            # First (= "total:") cpu-time line; subsequent are SCF/ANC/hess subtimers
            parts = stripped.replace(',', ' ').split()
            try:
                d_idx = parts.index('d')
                h_idx = parts.index('h')
                m_idx = parts.index('min')
                s_idx = parts.index('sec')
                days = int(parts[d_idx - 1])
                hours = int(parts[h_idx - 1])
                mins = int(parts[m_idx - 1])
                secs_float = float(parts[s_idx - 1])
                secs = int(secs_float)
                msecs = int((secs_float - secs) * 1000)
                qcdata.cpu = [days, hours, mins, secs, msecs]
            except (ValueError, IndexError):
                pass

    # Frequencies from the LAST projected-vibrational-frequencies block.
    # xtb prints all 3N modes; the first 5 (linear) or 6 (non-linear) are the
    # zero translational/rotational eigenvalues from projection — skip them by
    # filtering |v| < 1.0 cm⁻¹. Negatives below that are imaginary modes.
    if last_freq_idx >= 0:
        for k in range(last_freq_idx + 1, len(output)):
            parts = output[k].split()
            if not parts or parts[0] != 'eigval':
                break
            for tok in parts[2:]:
                try:
                    v = float(tok)
                except ValueError:
                    continue
                if abs(v) < 1.0:
                    continue
                if v > 0:
                    frequency_wn.append(v)
                else:
                    im_frequency_wn.append(v)
    qcdata.frequency_wn = frequency_wn
    qcdata.im_frequency_wn = im_frequency_wn

    # Multiplicity: if `unpaired electrons` was missing from output, leave at default 1.

    # Solvation: parse from program call line
    if program_call:
        qcdata.solvation_model = _xtb_solvation_from_call(program_call)
    else:
        qcdata.solvation_model = 'gas phase'

    # xtb has no DFT dispersion; GFN includes its own D3/D4 implicitly.
    qcdata.empirical_dispersion = 'No empirical dispersion detected'

    # Geometry fallback: SP-only and --hess runs lack a 'final structure:' block.
    if not qcdata.atom_nums:
        atom_nums, atom_types, cartesians = _xtb_read_xyz_fallback(file)
        if atom_nums:
            qcdata.atom_nums = atom_nums
            qcdata.atom_types = atom_types
            qcdata.cartesians = cartesians

    # Mass / rotational fallback: --hess-only runs print frequencies but skip
    # the THERMODYNAMIC block, so 'molecular mass / u' and 'rotational
    # constants / cm⁻¹' are absent.  Recompute from the geometry we just loaded.
    _fill_mass_and_rotemp_from_geometry(qcdata)

    # Linear molecules: keep only physical rotational temperatures.  xtb
    # inverts a near-zero moment of inertia for the linear axis, producing
    # either a huge positive number (e.g. 1e27 K) or a huge negative one;
    # restrict to 0 < T_rot < 1e10 K to drop both pathological cases.
    if qcdata.linear_mol and qcdata.rotemp:
        physical = [t for t in qcdata.rotemp if 0.0 < t < 1.0e10]
        if physical:
            qcdata.rotemp = physical
            qcdata.roconst = [t / HC_OVER_KB * 29.9792458 for t in physical]

    # Job type from observed content
    has_opt_geom = any('final structure:' in line for line in output)
    has_freq = bool(frequency_wn) or bool(im_frequency_wn)
    if has_opt_geom and has_freq:
        qcdata.job_type = 'GSFreq'
    elif has_opt_geom:
        qcdata.job_type = 'GS'
    elif has_freq:
        qcdata.job_type = 'Freq'
    else:
        qcdata.job_type = 'SP'

    return qcdata



# Bohr → Ångström conversion (squared, for amu*Bohr² → amu*Å²)
_BOHR2_TO_ANGSTROM2 = 0.5291772109 ** 2



[docs]
def parse_qchem_thermo(file):
    """Parse a Q-Chem 6 output file for all thermochemistry-relevant data.

    Handles single-job and multi-job (`@@@`-separated opt + freq + sp) inputs:
    the LAST occurrence wins for SCF energy, geometry, and frequencies, so
    the converged opt geometry and post-correlation energies are picked up
    correctly. Recognises native HF/DFT total energy, MP2 total energy, CCSD
    total energy, and CCSD(T) total energy via the precedence
    CCSD(T) > CCSD > MP2 > Total energy.

    Parameters
    ----------
    file : str
        Path to a Q-Chem 6 output file.
    """
    qcdata = QCData(file=file, program='QChem')

    with open(file, encoding='utf-8', errors='replace') as f:
        output = f.readlines()

    if not output:
        return qcdata

    HC_OVER_KB = 1.4387768775  # K per cm⁻¹

    # First pass: locate the last freq block, last geometry block, last
    # thermo header, and the input $molecule + $rem echoes (for charge,
    # multiplicity, and the requested correlation method).
    last_geom_header = -1
    last_thermo_header = -1
    last_freq_header = -1
    molecule_block_lines = []
    rem_methods = []
    in_molecule_block = False
    in_rem_block = False
    user_input_seen = False

    for i, line in enumerate(output):
        stripped = line.strip()
        if 'You are running Q-Chem version' in stripped:
            try:
                qcdata.version_program = 'Q-Chem version ' + stripped.split('version:')[1].strip()
            except IndexError:
                pass
        elif 'User input:' in stripped:
            user_input_seen = True
        elif user_input_seen and stripped == '$molecule' and not molecule_block_lines:
            in_molecule_block = True
            continue
        elif in_molecule_block:
            if stripped == '$end' or stripped.startswith('$'):
                in_molecule_block = False
            elif stripped:
                molecule_block_lines.append(stripped)
        elif user_input_seen and stripped.lower() == '$rem':
            in_rem_block = True
            continue
        elif in_rem_block:
            if stripped == '$end' or stripped.startswith('$'):
                in_rem_block = False
            elif stripped and not stripped.startswith('!'):
                parts = stripped.split()
                if len(parts) >= 2 and parts[0].lower() == 'method':
                    rem_methods.append(parts[1].lower())
        if 'Standard Nuclear Orientation (Angstroms)' in line:
            last_geom_header = i
        # Anchor on the harmonic block ("AT 298.15 K AND ..."); skip the
        # anharmonic VPT2/TOSH summaries that print only ZPE.
        if 'STANDARD THERMODYNAMIC QUANTITIES AT' in line:
            last_thermo_header = i
        if stripped.startswith('Frequency:'):
            last_freq_header = i

    # Method of the FINAL job step decides which energy line to trust.
    final_method = rem_methods[-1] if rem_methods else ''

    # Charge / multiplicity from the input $molecule block.
    # First non-`READ` line is "<charge> <mult>".
    for line in molecule_block_lines:
        s = line.strip()
        if s.upper() == 'READ':
            continue
        parts = s.split()
        if len(parts) >= 2:
            try:
                qcdata.charge = int(parts[0])
                qcdata.multiplicity = int(parts[1])
            except ValueError:
                pass
            break

    # Multiplicity fallback from "There are N alpha and M beta electrons".
    if qcdata.multiplicity == 1:
        for line in output:
            if 'alpha and' in line and 'beta electrons' in line:
                parts = line.split()
                try:
                    nalpha = int(parts[parts.index('alpha') - 1])
                    nbeta = int(parts[parts.index('beta') - 1])
                    qcdata.multiplicity = abs(nalpha - nbeta) + 1
                except (ValueError, IndexError):
                    pass

    # Energies — collect candidates; pick by FINAL method, not by precedence.
    # Q-Chem prints "MP2 total energy" as an extra analysis even when the
    # method is e.g. B2PLYP (a double hybrid that's properly captured by the
    # SCF "Total energy" line), so we can't blindly prefer correlated lines.
    scf_total = None
    mp2_total = None
    ccsd_total = None
    ccsdt_total = None
    for line in output:
        stripped = line.strip()
        if stripped.startswith('Total energy ='):
            try:
                scf_total = float(stripped.split('=')[1].split()[0])
            except (IndexError, ValueError):
                pass
        elif 'MP2' in stripped and 'total energy =' in stripped:
            try:
                mp2_total = float(stripped.split('=')[1].split()[0])
            except (IndexError, ValueError):
                pass
        elif stripped.startswith('CCSD total energy') or stripped.startswith('CCSD   total energy'):
            try:
                ccsd_total = float(stripped.split('=')[1].split()[0])
            except (IndexError, ValueError):
                pass
        elif stripped.startswith('CCSD(T) total energy'):
            try:
                ccsdt_total = float(stripped.split('=')[1].split()[0])
            except (IndexError, ValueError):
                pass
    if 'ccsd(t)' in final_method and ccsdt_total is not None:
        qcdata.scf_energy = ccsdt_total
    elif final_method == 'ccsd' and ccsd_total is not None:
        qcdata.scf_energy = ccsd_total
    elif final_method in ('mp2', 'rimp2', 'ump2') and mp2_total is not None:
        qcdata.scf_energy = mp2_total
    else:
        qcdata.scf_energy = scf_total

    # Solvation
    solvation_type = ''
    solvent_name = ''
    for line in output:
        stripped = line.strip()
        if 'Using C-PCM' in stripped or 'C-PCM dielectric factor' in stripped:
            solvation_type = solvation_type or 'CPCM'
        elif 'Using the SMD solvation model' in stripped:
            solvation_type = 'SMD'
        elif stripped.startswith('Solvent:'):
            try:
                solvent_name = stripped.split(':', 1)[1].strip()
            except IndexError:
                pass
    if solvation_type:
        qcdata.solvation_model = (solvation_type + ',' + solvent_name) if solvent_name else solvation_type

    # Final geometry — read from the LAST Standard Nuclear Orientation block.
    if last_geom_header >= 0:
        atom_nums, atom_types, cartesians = [], [], []
        # Skip 2 header lines + the dashes line before atoms.
        i = last_geom_header + 3
        while i < len(output):
            parts = output[i].split()
            if len(parts) >= 5:
                try:
                    int(parts[0])  # row index
                    elem = parts[1]
                    xyz = [float(parts[2]), float(parts[3]), float(parts[4])]
                    atom_types.append(elem)
                    atom_nums.append(element_id(elem, num=True))
                    cartesians.append(xyz)
                    i += 1
                    continue
                except ValueError:
                    pass
            break
        qcdata.atom_types = atom_types
        qcdata.atom_nums = atom_nums
        qcdata.cartesians = cartesians

    # Frequencies — collect from contiguous Frequency: lines starting at the last header.
    if last_freq_header >= 0:
        # Walk forward from the last_freq_header but we want all preceding
        # contiguous Frequency: lines that belong to the same final block.
        # Strategy: scan the file once more and collect freqs from the LAST
        # contiguous run of Frequency: lines (separated only by columns of
        # mode data, never by another major section header).
        freq_blocks = []
        current = []
        for line in output:
            stripped = line.strip()
            if stripped.startswith('Frequency:'):
                try:
                    nums = [float(x) for x in stripped.split(':', 1)[1].split()]
                    current.extend(nums)
                except ValueError:
                    pass
            elif current and ('STANDARD THERMODYNAMIC' in line or
                              'Standard Nuclear Orientation' in line):
                # End of a freq block; new section ahead.
                freq_blocks.append(current)
                current = []
        if current:
            freq_blocks.append(current)
        if freq_blocks:
            freqs = freq_blocks[-1]
            qcdata.frequency_wn = [f for f in freqs if f >= 0]
            qcdata.im_frequency_wn = [f for f in freqs if f < 0]

    # Point group: printed once per SCF (before the thermo block); take LAST.
    for line in output:
        if 'Molecular Point Group' in line:
            try:
                after = line.split('Molecular Point Group', 1)[1].strip()
                qcdata.point_group = after.split()[0]
            except (IndexError, ValueError):
                pass

    # Thermo block: ZPE, mass, symmno, principal moments (last block wins).
    if last_thermo_header >= 0:
        for line in output[last_thermo_header:]:
            stripped = line.strip()
            if stripped.startswith('Zero point vibrational energy:'):
                # "Zero point vibrational energy: 14.050 kcal/mol"
                try:
                    zpe_kcal = float(stripped.split(':')[1].split()[0])
                    qcdata.zero_point_corr = zpe_kcal / 627.509541
                except (IndexError, ValueError):
                    pass
            elif stripped.startswith('Molecular Mass:'):
                try:
                    qcdata.molecular_mass = float(stripped.split(':')[1].split()[0])
                except (IndexError, ValueError):
                    pass
            elif stripped.startswith('Eigenvalues --'):
                try:
                    eigs_bohr2 = [float(x) for x in stripped.split('--', 1)[1].split()][:3]
                    eigs_a2 = [e * _BOHR2_TO_ANGSTROM2 for e in eigs_bohr2]
                    nonzero = [e for e in eigs_a2 if e > 1e-6]
                    if len(nonzero) == 2:  # linear
                        qcdata.linear_mol = True
                        B_cm = 16.857629 / nonzero[-1]
                        qcdata.roconst = [B_cm * 29.9792458] * 3
                        qcdata.rotemp = [HC_OVER_KB * B_cm] * 3
                    elif len(nonzero) == 3:
                        roconst_cm = [16.857629 / I for I in nonzero]  # noqa: E741
                        qcdata.roconst = [b * 29.9792458 for b in roconst_cm]
                        qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm]
                except (IndexError, ValueError):
                    pass
            elif 'Rotational Symmetry Number is' in stripped:
                try:
                    qcdata.symmno = int(stripped.split('is')[1].split()[0])
                except (IndexError, ValueError):
                    pass

    # CPU time: "Total job time:  1.70s(wall), 21.27s(cpu)" — take the (cpu)
    # value (the wall × ncores estimate Q-Chem prints) so the sum matches
    # the Gaussian/NWChem/xTB convention of true summed CPU.
    for line in reversed(output):
        if 'Total job time:' in line:
            try:
                cpu_s = float(line.split(',')[1].split('s(cpu)')[0].strip())
                days = int(cpu_s // 86400)
                rem = cpu_s - days * 86400
                hours = int(rem // 3600)
                rem -= hours * 3600
                mins = int(rem // 60)
                rem -= mins * 60
                secs = int(rem)
                ms = int(round((rem - secs) * 1000))
                qcdata.cpu = [days, hours, mins, secs, ms]
            except (IndexError, ValueError):
                pass
            break

    # Job type from observed content.
    has_freq = bool(qcdata.frequency_wn) or bool(qcdata.im_frequency_wn)
    has_geom_block = last_geom_header >= 0
    if qcdata.im_frequency_wn:
        qcdata.job_type = 'TS'
    elif has_geom_block and has_freq:
        qcdata.job_type = 'GSFreq'
    elif has_freq:
        qcdata.job_type = 'Freq'
    else:
        qcdata.job_type = 'SP'

    _fill_mass_and_rotemp_from_geometry(qcdata)

    return qcdata



# Regex matching extxyz comment-line tokens: bare key=value or key="value with spaces"
_EXTXYZ_TOKEN = re.compile(r'(\w+)=("(?:[^"\\]|\\.)*"|\S+)')


def _parse_extxyz_comment(line):
    """Parse an extxyz comment line into a {key: str} dict.

    Strips surrounding double quotes from quoted values; leaves unquoted values
    untouched. Caller is responsible for type coercion.
    """
    out = {}
    for key, val in _EXTXYZ_TOKEN.findall(line):
        if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
            val = val[1:-1]
        out[key] = val
    return out


def _extxyz_bool(s):
    """Decode an extxyz boolean (T/F/True/False) to a Python bool."""
    return str(s).strip().lower() in ('t', 'true', '1', 'yes')



[docs]
def parse_ase_thermo(file):
    """Parse a GoodVibes ASE thermo extxyz file.

    The format is standard extended XYZ with thermochemistry metadata in the
    second line (key=value, ASE Atoms.info convention). Required keys:
    ``program=ase``, ``scf_energy``. Frequencies are space-separated in cm-1
    (negatives are imaginary). See tests/ase/README.md for the full spec.
    """
    qcdata = QCData(file=file, program='ase')

    with open(file, encoding='utf-8', errors='replace') as f:
        lines = f.readlines()
    if len(lines) < 2:
        return qcdata
    try:
        natoms = int(lines[0].strip())
    except ValueError:
        return qcdata

    info = _parse_extxyz_comment(lines[1])

    # Provenance
    qcdata.version_program = info.get('ase_version', 'ase')

    # Charge / multiplicity (defaults: 0 / 1)
    try:
        qcdata.charge = int(float(info.get('charge', '0')))
    except ValueError:
        qcdata.charge = 0
    try:
        qcdata.multiplicity = int(float(info.get('multiplicity', '1')))
    except ValueError:
        qcdata.multiplicity = 1

    # Electronic energy (Hartree by default; eV converted)
    if 'scf_energy' in info:
        try:
            e = float(info['scf_energy'])
            units = info.get('scf_energy_units', 'Hartree').lower()
            if units in ('ev',):
                e = e / 27.211386245988
            elif units in ('kcal/mol', 'kcalmol'):
                e = e / 627.509541
            elif units in ('kj/mol', 'kjmol'):
                e = e / 2625.4996394799
            qcdata.scf_energy = e
        except ValueError:
            pass

    # Zero-point energy (Hartree)
    if 'zpe' in info:
        try:
            qcdata.zero_point_corr = float(info['zpe'])
        except ValueError:
            pass

    # Frequencies (cm-1; negatives are imaginary)
    if 'frequencies' in info:
        for tok in info['frequencies'].split():
            try:
                v = float(tok)
            except ValueError:
                continue
            if v < 0:
                qcdata.im_frequency_wn.append(v)
            else:
                qcdata.frequency_wn.append(v)

    # Optional model-chemistry metadata
    qcdata.solvation_model = info.get('solvation_model', '')
    qcdata.empirical_dispersion = info.get('empirical_dispersion', '')
    qcdata.point_group = info.get('point_group', '')
    if 'symmno' in info:
        try:
            qcdata.symmno = int(float(info['symmno']))
        except ValueError:
            pass
    if 'linear_mol' in info:
        qcdata.linear_mol = _extxyz_bool(info['linear_mol'])
    if 'applied_freq_scale_factor' in info:
        try:
            qcdata.applied_freq_scale_factor = float(info['applied_freq_scale_factor'])
        except ValueError:
            pass

    # Atom block: species, x, y, z (additional columns ignored)
    atom_nums, atom_types, cartesians = [], [], []
    for line in lines[2:2 + natoms]:
        parts = line.split()
        if len(parts) < 4:
            continue
        elem = parts[0]
        try:
            xyz = [float(parts[1]), float(parts[2]), float(parts[3])]
        except ValueError:
            continue
        atom_types.append(elem)
        atom_nums.append(element_id(elem, num=True))
        cartesians.append(xyz)
    qcdata.atom_types = atom_types
    qcdata.atom_nums = atom_nums
    qcdata.cartesians = cartesians

    # Molecular mass: explicit override, else compute from atomic masses
    if 'molecular_mass' in info:
        try:
            qcdata.molecular_mass = float(info['molecular_mass'])
        except ValueError:
            pass
    if not qcdata.molecular_mass and atom_types:
        qcdata.molecular_mass = sum(ATOMIC_MASSES.get(a, 0.0) for a in atom_types)

    # Rotational constants: explicit override (cm-1), else derived from inertia
    HC_OVER_KB = 1.4387768775  # K per cm⁻¹
    if 'roconst_cm' in info:
        try:
            roconst_cm = [float(t) for t in info['roconst_cm'].split()][:3]
            qcdata.roconst = [b * 29.9792458 for b in roconst_cm]
            qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm]
        except ValueError:
            pass
    elif 'rotemp' in info:
        try:
            qcdata.rotemp = [float(t) for t in info['rotemp'].split()][:3]
            qcdata.roconst = [t / HC_OVER_KB * 29.9792458 for t in qcdata.rotemp]
        except ValueError:
            pass
    if not any(qcdata.rotemp) and len(atom_types) >= 2:
        _mass, eigvals = _principal_moments_of_inertia(atom_types, cartesians)
        # Linear molecules: smallest principal moment is ~0.
        nonzero = [e for e in eigvals if e > 1e-3]
        if not qcdata.linear_mol and len(nonzero) == 2:
            qcdata.linear_mol = True
        if qcdata.linear_mol:
            # Use the largest two equal moments; physics needs a single B.
            B_cm = 16.857629 / nonzero[-1]  # cm⁻¹
            qcdata.roconst = [B_cm * 29.9792458] * 3
            qcdata.rotemp = [HC_OVER_KB * B_cm] * 3
        elif len(nonzero) == 3:
            roconst_cm = [16.857629 / I for I in nonzero]  # noqa: E741
            qcdata.roconst = [b * 29.9792458 for b in roconst_cm]
            qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm]

    # Job type: explicit override, else infer from frequency presence/sign
    if 'job_type' in info:
        qcdata.job_type = info['job_type']
    else:
        if qcdata.im_frequency_wn:
            qcdata.job_type = 'TS'
        elif qcdata.frequency_wn:
            qcdata.job_type = 'Freq'
        else:
            qcdata.job_type = 'SP'

    return qcdata




[docs]
def parse_qcdata(file):
    """Parse any supported output file into a QCData object.

    Detects program from file content and delegates to the correct parser.

    Parameters
    ----------
    file : str
        Path to quantum chemistry output file.
    """
    stub = os.path.splitext(file)[0]
    possible_filenames = (stub + '.log', stub + '.out', stub + '.extxyz', file)
    data = None
    actual_file = file
    for possible_filename in possible_filenames:
        if os.path.exists(possible_filename):
            actual_file = possible_filename
            with open(possible_filename, encoding='utf-8', errors='replace') as f:
                data = f.readlines()
            break

    if data is None:
        return QCData(file=file, program='unknown')

    # Detect program from first ~80 lines (Q-Chem banner is past the SLURM preamble)
    program = 'unknown'
    for line in data[:80]:
        if 'Gaussian' in line:
            program = 'Gaussian'
            break
        if '* O   R   C   A *' in line:
            program = 'Orca'
            break
        if 'NWChem' in line:
            program = 'NWChem'
            break
        if 'x T B' in line or 'xtb version' in line:
            program = 'xtb'
            break
        if 'You are running Q-Chem' in line or 'Welcome to Q-Chem' in line:
            program = 'QChem'
            break
    # ASE extxyz: program=ase appears in the comment line (2nd line).
    if program == 'unknown' and len(data) >= 2 and 'program=ase' in data[1]:
        program = 'ase'

    if program == 'Gaussian':
        return parse_gaussian_thermo(actual_file)
    elif program == 'Orca':
        return parse_orca_thermo(actual_file)
    elif program == 'NWChem':
        return parse_nwchem_thermo(actual_file)
    elif program == 'xtb':
        return parse_xtb_thermo(actual_file)
    elif program == 'QChem':
        return parse_qchem_thermo(actual_file)
    elif program == 'ase':
        return parse_ase_thermo(actual_file)
    else:
        return QCData(file=file, program='unknown')