Source code for goodvibes.io

# -*- coding: utf-8 -*-
import json
import os.path
import re
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import List, Optional

import numpy as np



[docs] @dataclass class QCData: """Program-agnostic container for parsed quantum chemistry data. Populated by parse_qcdata() in io.py. Consumed by calc_bbe in thermo.py. """ # Provenance file: str = '' program: str = '' version_program: str = '' job_type: str = '' # Electronic structure scf_energy: Optional[float] = None charge: Optional[int] = None multiplicity: int = 1 # Model chemistry metadata solvation_model: str = '' empirical_dispersion: str = '' # Molecular properties molecular_mass: float = 0.0 symmno: int = 1 linear_mol: bool = False point_group: str = '' # Rotational data roconst: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0]) rotemp: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0]) linear_warning: bool = False # Frequencies (raw from parser — positive and negative separated) frequency_wn: List[float] = field(default_factory=list) im_frequency_wn: List[float] = field(default_factory=list) # Thermal corrections zero_point_corr: Optional[float] = None # CPU time [days, hours, mins, secs, msecs] cpu: List[int] = field(default_factory=lambda: [0, 0, 0, 0, 0]) # Number of MPI processes / cores reported by the QC program. Used by # the ORCA parser to convert TOTAL RUN TIME (wall) into an effective # CPU-time estimate (wall × nprocs); 1 elsewhere. nprocs: int = 1 # ONIOM MM frequency scaling fractions (per-frequency, empty if not ONIOM) fract_modelsys: List[float] = field(default_factory=list) has_oniom: bool = False # Molecular geometry atom_nums: List[int] = field(default_factory=list) atom_types: List[str] = field(default_factory=list) cartesians: List[List[float]] = field(default_factory=list) # Frequency scaling factor applied by QC program (ORCA only; 1.0 means unscaled) applied_freq_scale_factor: float = 1.0 # Single-point-correction (SPC) cache. Populated by calc_bbe when --spc # is used so the SPC file isn't re-parsed on subsequent --import runs. # `sp_suffix` is the --spc value that produced the cached numbers; on # a re-run with the same suffix the cache is reused, on a different # suffix it's bypassed and refreshed. `sp_file` is the resolved # absolute path of the SPC output (diagnostic only). sp_energy: Optional[float] = None sp_version_program: str = '' sp_solvation_model: str = '' sp_charge: Optional[int] = None sp_empirical_dispersion: str = '' sp_multiplicity: Optional[int] = None sp_suffix: str = '' sp_file: str = ''
# Cache version for JSON serialization format CACHE_VERSION = 1
[docs] def qcdata_to_dict(qcdata): """Convert a QCData instance to a JSON-serializable dictionary.""" d = asdict(qcdata) d['_cache_version'] = CACHE_VERSION return d
[docs] def dict_to_qcdata(d): """Reconstruct a QCData instance from a dictionary.""" clean = {k: v for k, v in d.items() if not k.startswith('_')} return QCData(**clean)
[docs] def save_cache(cache_dict, path): """Write QCData cache to a JSON file. Parameters ---------- cache_dict : dict Mapping of {basename_key: qcdata_dict} where each value is from qcdata_to_dict(). path : str Output JSON file path. """ envelope = { '_cache_version': CACHE_VERSION, '_created': datetime.now().isoformat(), 'entries': cache_dict, } with open(path, 'w') as f: json.dump(envelope, f, indent=2)
[docs] def load_cache(path): """Load a QCData cache from a JSON file. Auto-detects the format. v1.0+ payloads (the unified schema written by `--export` / `--json`) are read by extracting each result's `qcdata` block; the legacy cache envelope (`{_cache_version, entries}`, written by pre-v5.0 `--cache-save`) is also accepted for back-compat. Parameters ---------- path : str Path to JSON file (either v1.0 unified schema or legacy cache). Returns ------- dict Mapping of {basename_key: QCData}. """ with open(path, 'r') as f: data = json.load(f) if 'schema_version' in data: # Unified v1.0+ payload — use each result's qcdata block. from .schema import validate validate(data) cache = {} for entry in data.get('results', []): qc = entry.get('qcdata') if qc is None: continue name = entry.get('name') if name is None: # Fall back to deriving from the file path. fp = entry.get('file', '') name = os.path.splitext(os.path.basename(fp))[0] cache[name] = dict_to_qcdata(qc) return cache # Legacy envelope: {_cache_version, _created, entries} version = data.get('_cache_version', 0) if version != CACHE_VERSION: raise ValueError( "Cache version mismatch: expected {} (legacy) or a v1.0+ " "schema_version, got _cache_version={}".format(CACHE_VERSION, version) ) return {key: dict_to_qcdata(val) for key, val in data['entries'].items()}
# PHYSICAL CONSTANTS UNITS KCAL_TO_AU = 627.509541 # UNIT CONVERSION # Radii used to determine connectivity in symmetry corrections # Covalent radii taken from Cambridge Structural Database RADII = {'H': 0.32, 'He': 0.93, 'Li': 1.23, 'Be': 0.90, 'B': 0.82, 'C': 0.77, 'N': 0.75, 'O': 0.73, 'F': 0.72, 'Ne': 0.71, 'Na': 1.54, 'Mg': 1.36, 'Al': 1.18, 'Si': 1.11, 'P': 1.06, 'S': 1.02, 'Cl': 0.99, 'Ar': 0.98, 'K': 2.03, 'Ca': 1.74, 'Sc': 1.44, 'Ti': 1.32, 'V': 1.22, 'Cr': 1.18, 'Mn': 1.17, 'Fe': 1.17, 'Co': 1.16, 'Ni': 1.15, 'Cu': 1.17, 'Zn': 1.25, 'Ga': 1.26, 'Ge': 1.22, 'As': 1.20, 'Se': 1.16, 'Br': 1.14, 'Kr': 1.12, 'Rb': 2.16, 'Sr': 1.91, 'Y': 1.62, 'Zr': 1.45, 'Nb': 1.34, 'Mo': 1.30, 'Tc': 1.27, 'Ru': 1.25, 'Rh': 1.25, 'Pd': 1.28, 'Ag': 1.34, 'Cd': 1.48, 'In': 1.44, 'Sn': 1.41, 'Sb': 1.40, 'Te': 1.36, 'I': 1.33, 'Xe': 1.31, 'Cs': 2.35, 'Ba': 1.98, 'La': 1.69, 'Lu': 1.60, 'Hf': 1.44, 'Ta': 1.34, 'W': 1.30, 'Re': 1.28, 'Os': 1.26, 'Ir': 1.27, 'Pt': 1.30, 'Au': 1.34, 'Hg': 1.49, 'Tl': 1.48, 'Pb': 1.47, 'Bi': 1.46, 'X': 0} # Bondi van der Waals radii for all atoms from: Bondi, A. J. Phys. Chem. 1964, 68, 441-452, # except hydrogen, which is taken from Rowland, R. S.; Taylor, R. J. Phys. Chem. 1996, 100, 7384-7391. # Radii unavailable in either of these publications are set to 2 Angstrom # (Unfinished) BONDI = {'H': 1.09, 'He': 1.40, 'Li': 1.82, 'Be': 2.00, 'B': 2.00, 'C': 1.70, 'N': 1.55, 'O': 1.52, 'F': 1.47, 'Ne': 1.54} # Some useful arrays periodictable = ["", "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Uub", "Uut", "Uuq", "Uup", "Uuh", "Uus", "Uuo"]
[docs] def element_id(massno, num=False): """ Get element symbol from mass number. Used in parsing output files to determine elements present in file. Parameter: massno (int): mass of element. Returns: str: element symbol, or 'XX' if not found in periodic table. """ try: if num: return periodictable.index(massno) return periodictable[massno] except IndexError: return "XX"
# Standard atomic weights (IUPAC 2021), most common isotope where applicable. # Used by parse_ase_thermo when an extxyz fixture omits molecular_mass / rotational # constants and they must be computed from the geometry. ATOMIC_MASSES = { 'H': 1.00782503207, 'He': 4.002602, 'Li': 7.0160034366, 'Be': 9.012183065, 'B': 11.00930536, 'C': 12.0, 'N': 14.0030740048, 'O': 15.99491461957, 'F': 18.998403163, 'Ne': 19.9924401762, 'Na': 22.9897692820, 'Mg': 23.985041697, 'Al': 26.98153853, 'Si': 27.97692653465, 'P': 30.97376199842, 'S': 31.9720711744, 'Cl': 34.968852682, 'Ar': 39.9623831237, 'K': 38.96370648, 'Ca': 39.96259098, 'Sc': 44.95590828, 'Ti': 47.94794198, 'V': 50.94395704, 'Cr': 51.94050623, 'Mn': 54.93804451, 'Fe': 55.93493633, 'Co': 58.93319429, 'Ni': 57.93534241, 'Cu': 62.92959772, 'Zn': 63.92914201, 'Ga': 68.9255735, 'Ge': 73.921177761, 'As': 74.92159457, 'Se': 79.9165218, 'Br': 78.9183376, 'Kr': 83.9114977282, 'Rb': 84.91178974, 'Sr': 87.9056125, 'Y': 88.9058403, 'Zr': 89.9046977, 'Nb': 92.906373, 'Mo': 97.90540482, 'Tc': 98.9062508, 'Ru': 101.9043441, 'Rh': 102.905498, 'Pd': 105.9034804, 'Ag': 106.9050916, 'Cd': 113.90336509, 'In': 114.903878776, 'Sn': 119.90220163, 'Sb': 120.903812, 'Te': 129.906222748, 'I': 126.9044719, 'Xe': 131.9041550856, } def _principal_moments_of_inertia(atom_types, cartesians): """Return (total_mass [amu], [I_a, I_b, I_c] in amu*Angstrom^2). Used as a fallback when an extxyz fixture omits molecular_mass / roconst / rotemp; computes them from the geometry using ATOMIC_MASSES. """ masses = np.array([ATOMIC_MASSES.get(a, 0.0) for a in atom_types]) coords = np.array(cartesians, dtype=float) total = float(masses.sum()) if total <= 0.0 or len(coords) == 0: return total, np.array([0.0, 0.0, 0.0]) com = (masses[:, None] * coords).sum(axis=0) / total r = coords - com Ixx = float((masses * (r[:, 1] ** 2 + r[:, 2] ** 2)).sum()) Iyy = float((masses * (r[:, 0] ** 2 + r[:, 2] ** 2)).sum()) Izz = float((masses * (r[:, 0] ** 2 + r[:, 1] ** 2)).sum()) Ixy = float(-(masses * r[:, 0] * r[:, 1]).sum()) Ixz = float(-(masses * r[:, 0] * r[:, 2]).sum()) Iyz = float(-(masses * r[:, 1] * r[:, 2]).sum()) inertia = np.array([[Ixx, Ixy, Ixz], [Ixy, Iyy, Iyz], [Ixz, Iyz, Izz]]) eigvals = np.linalg.eigvalsh(inertia) eigvals = np.sort(np.maximum(eigvals, 0.0)) return total, eigvals def _fill_mass_and_rotemp_from_geometry(qcdata): """Populate molecular_mass and rotational constants from geometry. Fallback used when a QC program omits the THERMOCHEMISTRY block (e.g. Gaussian freq-without-opt, ORCA --hess-only) — recomputes from atom_types + cartesians via ATOMIC_MASSES so that downstream translational/rotational entropy doesn't see mass=0 or rotemp=[0,0,0]. No-op when the parser already populated the fields. """ if not (qcdata.atom_types and qcdata.cartesians): return needs_mass = qcdata.molecular_mass == 0.0 needs_rotemp = qcdata.rotemp == [0.0, 0.0, 0.0] if not (needs_mass or needs_rotemp): return total_mass, eigvals = _principal_moments_of_inertia( qcdata.atom_types, qcdata.cartesians) if needs_mass: qcdata.molecular_mass = total_mass if needs_rotemp: HC_OVER_KB = 1.4387768775 # K per cm⁻¹ roconst_cm = [16.857629 / I if I > 1e-3 else 0.0 for I in eigvals] qcdata.roconst = [b * 29.9792458 for b in roconst_cm] qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm]
[docs] def compute_connectivity(atom_types, cartesians, tolerance=0.2): """Compute molecular connectivity based on covalent radii.""" connectivity = [] for i, ai in enumerate(atom_types): row = [] for j, aj in enumerate(atom_types): if i == j: continue cutoff = RADII[ai] + RADII[aj] + tolerance distance = np.linalg.norm(np.array(cartesians[i]) - np.array(cartesians[j])) if distance < cutoff: row.append(j) connectivity.append(row) return connectivity
[docs] def write_xyz(filepath, files, thermo_data): """Write optimized Cartesian coordinates to a multi-structure .xyz file. Each structure block contains the atom count, a comment line with the filename and SCF energy, and the Cartesian coordinates. Parameters: filepath (str): output .xyz file path. files (list): output file paths whose coordinates to write. thermo_data (dict): file path → calc_bbe mapping. """ from .utils import display_name with open(filepath, 'w') as f: for file in files: bbe = thermo_data[file] f.write(f'{len(bbe.atom_types)}\n') if bbe.scf_energy is not None: f.write(f'{display_name(file):<39} {"Eopt":>13} {bbe.scf_energy:.6f}\n') else: f.write(f'{display_name(file):<39}\n') if bbe.atom_types and bbe.cartesians: for atom, carts in zip(bbe.atom_types, bbe.cartesians): f.write(f'{atom:>1}{carts[0]:13.6f}{carts[1]:13.6f}{carts[2]:13.6f}\n')
[docs] def find_spc_file(name, spc): """Locate the single-point-correction output paired with ``name``. Pattern matched: ``{name}_{spc}.{log,out}`` — exact match. ``--spc TZVP`` finds ``filename_TZVP.log`` only; it will NOT match ``filename_def2_TZVP.log``. To pair with a file like ``filename_def2_TZVP.log``, pass the full suffix (``--spc def2_TZVP``). Parameters ---------- name : str Path stem of the parent output file (no extension). spc : str Suffix passed via ``--spc``. Returns ------- str or None Path to the matching file, or None if nothing matches. """ for ext in ('.log', '.out'): candidate = f'{name}_{spc}{ext}' if os.path.isfile(candidate): return candidate return None
[docs] def parse_data(file): """ Read computational chemistry output file. Attempt to obtain single point energy, program type, program version, solvation_model, charge, empirical_dispersion, and multiplicity from file. Parameter: file (str): name of file to be parsed. Returns: float: single point energy. str: program used to run calculation. str: version of program used to run calculation. str: solvation model used in chemical calculation (if any). str: original filename parsed. int: overall charge of molecule or chemical system. str: empirical dispersion used in chemical calculation (if any). int: multiplicity of molecule or chemical system. """ spe, program, data, version_program, solvation_model, keyword_line, a, charge, multiplicity = 'none', 'none', [], '', '', '', 0, None, None data = None stub = os.path.splitext(file)[0] possible_filenames = (stub + ".log", stub + ".out", stub + ".extxyz", file) actual_file = file for possible_filename in possible_filenames: if os.path.exists(possible_filename): actual_file = possible_filename with open(possible_filename, encoding='utf-8', errors='replace') as f: data = f.readlines() break if data is None: raise ValueError("File {} does not exist".format(file)) # ASE extxyz: delegate to parse_ase_thermo and translate to parse_data's # tuple shape; ASE files have no separate single-point semantics. if len(data) >= 2 and 'program=ase' in data[1]: q = parse_ase_thermo(actual_file) return (q.scf_energy if q.scf_energy is not None else 'none', 'ase', q.version_program, q.solvation_model or 'gas phase', file, q.charge if q.charge is not None else 0, q.empirical_dispersion or 'No empirical dispersion detected', q.multiplicity) # Q-Chem: delegate to parse_qchem_thermo (Q-Chem energy lines, MP2/CCSD # precedence, and $molecule echo are all already handled there). if any('You are running Q-Chem' in ln or 'Welcome to Q-Chem' in ln for ln in data[:80]): q = parse_qchem_thermo(actual_file) return (q.scf_energy if q.scf_energy is not None else 'none', 'QChem', q.version_program, q.solvation_model or 'gas phase', file, q.charge if q.charge is not None else 0, q.empirical_dispersion or 'No empirical dispersion detected', q.multiplicity) for line in data: if "Gaussian" in line: program = "Gaussian" break if "* O R C A *" in line: program = "Orca" break if "NWChem" in line: program = "NWChem" break if "x T B" in line or "xtb version" in line: program = "xtb" break repeated_link1 = 0 freq_started = False # Guard against VPT2 displaced geometry energies zero_point_corr_G4 = 0.0 for line in data: if program == "Gaussian": # Reset freq_started at each new link (linked jobs have separate links) if 'Normal termination' in line: freq_started = False if line.strip().startswith('Frequencies -- '): freq_started = True if not freq_started and line.strip().startswith('SCF Done:'): spe = float(line.strip().split()[4]) elif not freq_started and line.strip().startswith('E2('): spe_value = line.strip().split()[-1] spe = float(spe_value.replace('D','E')) elif not freq_started and 'EUMP2 =' in line.strip(): spe = float((line.strip().split()[5]).replace('D', 'E')) elif 'CCSD(T)=' in line.strip(): raw = line.strip().split('CCSD(T)=')[1].split('\\')[0].split()[0] spe = float(raw.replace('D', 'E')) elif 'CCSD=' in line.strip() and 'CCSD(T)' not in line.strip(): raw = line.strip().split('CCSD=')[1].split('\\')[0].split()[0] spe = float(raw.replace('D', 'E')) elif line.strip().startswith('Counterpoise corrected energy'): spe = float(line.strip().split()[4]) # For ONIOM calculations use the extrapolated value rather than SCF value elif "ONIOM: extrapolated energy" in line.strip(): spe = (float(line.strip().split()[4])) # For G4 calculations look for G4 energies (Gaussian16a bug prints G4(0 K) as DE(HF)) --Brian modified to work for G16c-where bug is fixed. elif line.strip().startswith('G4(0 K)'): spe = float(line.strip().split()[2]) spe -= zero_point_corr_G4 #Remove G4 ZPE elif line.strip().startswith('E(ZPE)='): #Get G4 ZPE zero_point_corr_G4 = float(line.strip().split()[1]) # For TD calculations look for SCF energies of the first excited state elif 'E(TD-HF/TD-DFT)' in line.strip(): spe = float(line.strip().split()[4]) # For Semi-empirical or Molecular Mechanics calculations elif "Energy= " in line.strip() and "Predicted" not in line.strip() and "Thermal" not in line.strip() and "G4" not in line.strip(): spe = (float(line.strip().split()[1])) elif "Gaussian" in line and "Revision" in line and repeated_link1 == 0: for i in range(len(line.strip(",").split(",")) - 1): line.strip(",").split(",")[i] version_program += line.strip(",").split(",")[i] repeated_link1 = 1 version_program = version_program[1:] # Charge and multiplicity elif 'Charge' in line and 'Multiplicity' in line: try: parts = line.split() charge = int(parts[parts.index('Charge') + 2]) multiplicity = int(parts[parts.index('Multiplicity') + 2]) except (ValueError, IndexError): pass elif program == "Orca": if 'Program Version' in line.strip(): version_program = "ORCA version " + line.split()[2] if line.strip().startswith('FINAL SINGLE POINT ENERGY'): spe = float(line.strip().split()[-1]) if "Total Charge" in line.strip() and "...." in line.strip(): charge = int(line.strip("=").split()[-1]) if "Multiplicity" in line.strip() and "...." in line.strip(): multiplicity = int(line.strip("=").split()[-1]) elif program == "NWChem": if 'nwchem branch' in line.strip(): version_program = "NWChem version " + line.split()[3] if line.strip().startswith('Total DFT energy'): spe = float(line.strip().split()[4]) if "charge" in line.strip(): charge = int(line.strip().split()[-1]) if "mult " in line.strip(): multiplicity = int(line.strip().split()[-1]) elif "Spin multiplicity:" in line: try: multiplicity = int(line.split(":", 1)[1].strip().split()[0]) except (ValueError, IndexError): pass elif program == "xtb": stripped = line.strip() if 'xtb version' in stripped and not version_program: parts = stripped.split() try: version_program = "xtb version " + parts[parts.index('version') + 1] except (ValueError, IndexError): pass if stripped.startswith(':: total energy'): parts = stripped.split() try: spe = float(parts[3]) except (ValueError, IndexError): pass if 'net charge' in stripped and stripped.startswith(':'): parts = stripped.split() try: charge = int(parts[parts.index('charge') + 1]) except (ValueError, IndexError): pass if 'unpaired electrons' in stripped and stripped.startswith(':'): parts = stripped.split() try: multiplicity = int(parts[parts.index('electrons') + 1]) + 1 except (ValueError, IndexError): pass # Solvation model and empirical dispersion detection if 'Gaussian' in version_program.strip(): for i, line in enumerate(data): if '#' in line.strip() and a == 0: for j, line in enumerate(data[i:i + 10]): if '--' in line.strip(): a = a + 1 break if a != 0: break else: for k in range(len(line.strip().split("\n"))): line.strip().split("\n")[k] keyword_line += line.strip().split("\n")[k] keyword_line = keyword_line.lower() if 'scrf' not in keyword_line.strip(): solvation_model = "gas phase" else: start_scrf = keyword_line.strip().find('scrf') + 4 if '(' in keyword_line[start_scrf:start_scrf + 4]: start_scrf += keyword_line[start_scrf:start_scrf + 4].find('(') + 1 end_scrf = keyword_line.find(")", start_scrf) display_solvation_model = "scrf=(" + ','.join( keyword_line[start_scrf:end_scrf].lower().split(',')) + ')' sorted_solvation_model = "scrf=(" + ','.join( sorted(keyword_line[start_scrf:end_scrf].lower().split(','))) + ')' else: if ' = ' in keyword_line[start_scrf:start_scrf + 4]: start_scrf += keyword_line[start_scrf:start_scrf + 4].find(' = ') + 3 elif ' =' in keyword_line[start_scrf:start_scrf + 4]: start_scrf += keyword_line[start_scrf:start_scrf + 4].find(' =') + 2 elif '=' in keyword_line[start_scrf:start_scrf + 4]: start_scrf += keyword_line[start_scrf:start_scrf + 4].find('=') + 1 end_scrf = keyword_line.find(" ", start_scrf) if end_scrf == -1: display_solvation_model = "scrf=(" + ','.join(keyword_line[start_scrf:].lower().split(',')) + ')' sorted_solvation_model = "scrf=(" + ','.join( sorted(keyword_line[start_scrf:].lower().split(','))) + ')' else: display_solvation_model = "scrf=(" + ','.join( keyword_line[start_scrf:end_scrf].lower().split(',')) + ')' sorted_solvation_model = "scrf=(" + ','.join( sorted(keyword_line[start_scrf:end_scrf].lower().split(','))) + ')' if solvation_model != "gas phase": solvation_model = [sorted_solvation_model, display_solvation_model] empirical_dispersion = '' if keyword_line.strip().find('empiricaldispersion') == -1 and keyword_line.strip().find( 'emp=') == -1 and keyword_line.strip().find('emp =') == -1 and keyword_line.strip().find('emp(') == -1: empirical_dispersion = "No empirical dispersion detected" elif keyword_line.strip().find('empiricaldispersion') > -1: start_emp_disp = keyword_line.strip().find('empiricaldispersion') + 19 if '(' in keyword_line[start_emp_disp:start_emp_disp + 4]: start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('(') + 1 end_emp_disp = keyword_line.find(")", start_emp_disp) empirical_dispersion = 'empiricaldispersion=(' + ','.join( sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')' else: if ' = ' in keyword_line[start_emp_disp:start_emp_disp + 4]: start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find(' = ') + 3 elif ' =' in keyword_line[start_emp_disp:start_emp_disp + 4]: start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find(' =') + 2 elif '=' in keyword_line[start_emp_disp:start_emp_disp + 4]: start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('=') + 1 end_emp_disp = keyword_line.find(" ", start_emp_disp) if end_emp_disp == -1: empirical_dispersion = "empiricaldispersion=(" + ','.join( sorted(keyword_line[start_emp_disp:].lower().split(','))) + ')' else: empirical_dispersion = "empiricaldispersion=(" + ','.join( sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')' elif keyword_line.strip().find('emp=') > -1 or keyword_line.strip().find( 'emp =') > -1 or keyword_line.strip().find('emp(') > -1: # Check for temp keyword temp, emp_e, emp_p = False, False, False check_temp = keyword_line.strip().find('emp=') start_emp_disp = keyword_line.strip().find('emp=') if check_temp == -1: check_temp = keyword_line.strip().find('emp =') start_emp_disp = keyword_line.strip().find('emp =') if check_temp == -1: check_temp = keyword_line.strip().find('emp=(') start_emp_disp = keyword_line.strip().find('emp(') check_temp += -1 if keyword_line[check_temp].lower() == 't': temp = True # Look for a new one if keyword_line.strip().find('emp=', check_temp + 5) > -1: emp_e = True start_emp_disp = keyword_line.strip().find('emp=', check_temp + 5) + 3 elif keyword_line.strip().find('emp =', check_temp + 5) > -1: emp_e = True start_emp_disp = keyword_line.strip().find('emp =', check_temp + 5) + 3 elif keyword_line.strip().find('emp(', check_temp + 5) > -1: emp_p = True start_emp_disp = keyword_line.strip().find('emp(', check_temp + 5) + 3 else: empirical_dispersion = "No empirical dispersion detected" else: start_emp_disp += 3 if (temp and emp_e) or (not temp and keyword_line.strip().find('emp=') > -1) or ( not temp and keyword_line.strip().find('emp =')): if '(' in keyword_line[start_emp_disp:start_emp_disp + 4]: start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('(') + 1 end_emp_disp = keyword_line.find(")", start_emp_disp) empirical_dispersion = 'empiricaldispersion=(' + ','.join( sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')' else: if ' = ' in keyword_line[start_emp_disp:start_emp_disp + 4]: start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find(' = ') + 3 elif ' =' in keyword_line[start_emp_disp:start_emp_disp + 4]: start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find(' =') + 2 elif '=' in keyword_line[start_emp_disp:start_emp_disp + 4]: start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('=') + 1 end_emp_disp = keyword_line.find(" ", start_emp_disp) if end_emp_disp == -1: empirical_dispersion = "empiricaldispersion=(" + ','.join( sorted(keyword_line[start_emp_disp:].lower().split(','))) + ')' else: empirical_dispersion = "empiricaldispersion=(" + ','.join( sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')' elif (temp and emp_p) or (not temp and keyword_line.strip().find('emp(') > -1): start_emp_disp += keyword_line[start_emp_disp:start_emp_disp + 4].find('(') + 1 end_emp_disp = keyword_line.find(")", start_emp_disp) empirical_dispersion = 'empiricaldispersion=(' + ','.join( sorted(keyword_line[start_emp_disp:end_emp_disp].lower().split(','))) + ')' if 'ORCA' in version_program.strip(): keyword_line_1 = "gas phase" keyword_line_2 = '' keyword_line_3 = '' for i, line in enumerate(data): if 'CPCM SOLVATION MODEL' in line.strip(): keyword_line_1 = "CPCM," if 'SMD CDS free energy correction energy' in line.strip(): keyword_line_2 = "SMD," if "Solvent: " in line.strip(): keyword_line_3 = line.strip().split()[-1] solvation_model = keyword_line_1 + keyword_line_2 + keyword_line_3 empirical_dispersion1 = 'No empirical dispersion detected' empirical_dispersion2 = '' empirical_dispersion3 = '' for i, line in enumerate(data): if keyword_line.strip().find('DFT DISPERSION CORRECTION') > -1: empirical_dispersion1 = '' if keyword_line.strip().find('DFTD3') > -1: empirical_dispersion2 = "D3" if keyword_line.strip().find('USING zero damping') > -1: empirical_dispersion3 = ' with zero damping' empirical_dispersion = empirical_dispersion1 + empirical_dispersion2 + empirical_dispersion3 if 'NWChem' in version_program.strip(): empirical_dispersion1 = 'No empirical dispersion detected' empirical_dispersion2 = '' empirical_dispersion3 = '' for i, line in enumerate(data): if keyword_line.strip().find('Dispersion correction') > -1: empirical_dispersion1 = '' if keyword_line.strip().find('disp vdw 3') > -1: empirical_dispersion2 = "D3" if keyword_line.strip().find('disp vdw 4') > -1: empirical_dispersion2 = "D3BJ" empirical_dispersion = empirical_dispersion1 + empirical_dispersion2 + empirical_dispersion3 if 'xtb' in version_program.strip(): solvation_model = 'gas phase' for line in data: stripped = line.strip() if 'program call' in stripped and ':' in stripped: solvation_model = _xtb_solvation_from_call(stripped.split(':', 1)[1].strip()) break empirical_dispersion = 'No empirical dispersion detected' return spe, program, version_program, solvation_model, file, charge, empirical_dispersion, multiplicity
[docs] def sp_cpu(file): """Read single-point output for CPU time. Delegates to ``parse_qcdata(file).cpu`` so the SPC CPU is identical to what a standalone parse of the same file would report. This guarantees that ``goodvibes <parents> --spc <suffix>`` totals correctly reflect the SPC files' CPU — including program-specific accumulation (Gaussian sums multi-step ``Job cpu time`` lines) and nprocs scaling (ORCA reports wall time only; the parser scales by parallel-MPI process count to estimate effective CPU). """ candidates = [ os.path.splitext(file)[0] + '.log', os.path.splitext(file)[0] + '.out', ] for cand in candidates: if os.path.exists(cand): return parse_qcdata(cand).cpu raise ValueError("File {} does not exist".format(file))
# Tokens on the ORCA ``!`` keyword line that are NOT method or basis set. _ORCA_SKIP_TOKENS = { # Job types 'OPT', 'OPTTS', 'FREQ', 'NUMFREQ', 'NUMHESS', 'ANFREQ', 'SP', 'NMR', 'VPT2', 'SCANTS', 'NEB-TS', 'FAST-NEB-TS', 'NEB-CI', # Solvation 'CPCM', 'SMD', # Dispersion (standalone keywords, not embedded in functional name) 'D3BJ', 'D3', 'D4', 'D3ZERO', # SCF convergence 'TIGHTSCF', 'NORMALSCF', 'LOOSESCF', 'VERYTIGHTSCF', 'EXTREMESCF', 'SLOWCONV', 'NOCONV', # RI approximations 'RIJCOSX', 'RI', 'RIJK', 'RIJONX', 'AUTOAUX', 'RIJDX', # Grid 'GRID4', 'GRID5', 'GRID6', 'GRID7', 'FINALGRID4', 'FINALGRID5', 'FINALGRID6', 'FINALGRID7', # Output verbosity 'PRINT', 'MINIPRINT', 'LARGEPRINT', 'PRINTBASIS', 'PRINTMOS', 'NOPRINT', # Reference (DFT) 'UKS', 'RKS', # Other 'NOFROZENCORE', 'FROZENCORE', 'MOREAD', 'XYZFILE', 'SCALFREQ', 'QUASIRRHO', } # Reference keywords that map to HF when no other method is present _ORCA_HF_REFS = {'UHF', 'RHF', 'ROHF'} # Regex for recognizing basis set tokens (case-insensitive matching) _BASIS_PATTERNS = [ re.compile(r'^\d+-\d+\+*G', re.IGNORECASE), # Pople: 6-31G, 6-311+G, etc. re.compile(r'^(AUG-)?CC-PV[DTQR56]Z$', re.IGNORECASE), # Dunning re.compile(r'^DEF2-', re.IGNORECASE), # Karlsruhe def2- re.compile(r'^(SV|TZV|QZV)P?P?$', re.IGNORECASE), # Ahlrichs without def2 re.compile(r'^SV\(P\)$', re.IGNORECASE), # SV(P) ] def _is_basis_set(token): """Return True if *token* looks like a basis set name.""" return any(pat.match(token) for pat in _BASIS_PATTERNS) def _is_auxiliary_basis(token): """Return True if *token* is an auxiliary basis set (e.g. cc-pVTZ/C, def2/J).""" upper = token.upper() if '/' in upper: suffix = upper.rsplit('/', 1)[1] if suffix in ('C', 'J', 'JK', 'JKFIT', 'CFIT'): return True # def2/J, def2/JK style if upper.startswith('DEF2/'): return True return False def _parse_orca_lot(data): """Extract (method, basis_set) from ORCA output file lines. Parses the ``!`` keyword line(s) in the ORCA input section and classifies tokens into method, basis set, or known keywords to skip. Falls back to the ``Your calculation utilizes the basis:`` line when no basis set is found on the ``!`` line. """ kw_tokens = [] fallback_basis = None for line in data: stripped = line.strip() # Collect tokens from ORCA input keyword lines: | N> ! ... # The ``!`` must be the first non-whitespace after ``>`` to avoid # matching ``!`` inside comments on other input lines. if '|' in line and '>' in stripped: after_angle = stripped.split('>', 1)[1].lstrip() if after_angle.startswith('!'): after_bang = after_angle.split('!', 1)[1] kw_tokens.extend(after_bang.split()) # Fallback basis from ORCA output section if 'Your calculation utilizes the basis:' in stripped and fallback_basis is None: fallback_basis = stripped.split(':')[-1].strip() # Classify tokens method = None basis = None hf_ref = None # Track UHF/RHF/ROHF in case it's the only method for token in kw_tokens: upper = token.upper() # Skip parallelization (pal4, pal8, pal16, ...) if re.match(r'^PAL\d+$', upper): continue # Skip known ORCA keywords if upper in _ORCA_SKIP_TOKENS: continue # Skip auxiliary basis sets if _is_auxiliary_basis(token): continue # Skip GCP(...) tokens if upper.startswith('GCP(') or upper == 'GCP': continue # Skip QM/QM2 compound job keyword if upper == 'QM/QM2': continue # Track HF reference keywords if upper in _ORCA_HF_REFS: hf_ref = 'HF' continue # Classify as basis set or method if _is_basis_set(token) and basis is None: basis = token elif method is None: method = token # Fall back: if only HF reference found and no other method if method is None and hf_ref is not None: method = hf_ref # Fall back to "utilizes the basis" line if basis is None and fallback_basis is not None: basis = fallback_basis return method or 'none', basis or 'none'
[docs] def level_of_theory(file): """Read output for the level of theory and basis set used.""" repeated_theory = 0 with open(file, encoding='utf-8', errors='replace') as f: data = f.readlines() level, bs = 'none', 'none' # Detect ORCA and use dedicated parser for line in data: if '* O R C A *' in line: level, bs = _parse_orca_lot(data) return '/'.join([level, bs]) if 'Gaussian' in line: break if 'NWChem' in line: break if 'x T B' in line or 'xtb version' in line: for ln in data: if 'Hamiltonian' in ln and 'xTB' in ln: parts = ln.split() for tok in parts: if 'xTB' in tok: return tok + '/none' return 'GFN2-xTB/none' for line in data: if line.strip().find('External calculation') > -1: level, bs = 'ext', 'ext' break if '\\Freq\\' in line.strip() and repeated_theory == 0: try: level, bs = (line.strip().split("\\")[4:6]) repeated_theory = 1 except IndexError: pass elif '|Freq|' in line.strip() and repeated_theory == 0: try: level, bs = (line.strip().split("|")[4:6]) repeated_theory = 1 except IndexError: pass if '\\SP\\' in line.strip() and repeated_theory == 0: try: level, bs = (line.strip().split("\\")[4:6]) repeated_theory = 1 except IndexError: pass elif '|SP|' in line.strip() and repeated_theory == 0: try: level, bs = (line.strip().split("|")[4:6]) repeated_theory = 1 except IndexError: pass if 'DLPNO BASED TRIPLES CORRECTION' in line.strip(): level = 'DLPNO-CCSD(T)' if 'Estimated CBS total energy' in line.strip(): try: bs = ("Extrapol." + line.strip().split()[4]) except IndexError: pass # Remove the restricted R or unrestricted U label if level[0] in ('R', 'U'): level = level[1:] level_of_theory = '/'.join([level, bs]) return level_of_theory
[docs] def read_initial(file): """At beginning of procedure, read level of theory, solvation model, and check for normal termination""" with open(file, encoding='utf-8', errors='replace') as f: data = f.readlines() level, bs, program, keyword_line = 'none', 'none', 'none', 'none' solvation_model = "gas phase" progress, orientation = 'Incomplete', 'Input' a, repeated_theory = 0, 0 no_grid = True dft_used = 'F' grid_lookup = {1: 'sg1', 2: 'coarse', 4: 'fine', 5: 'ultrafine', 7: 'superfine'} for line in data: # Determine program if "Gaussian" in line: program = "Gaussian" break if "* O R C A *" in line: program = "Orca" break if "NWChem" in line: program = "NWChem" break if "x T B" in line or "xtb version" in line: program = "xtb" break if "You are running Q-Chem" in line or "Welcome to Q-Chem" in line: program = "QChem" break # ASE extxyz: program=ase appears in line 2 (the comment line) if program == 'none' and len(data) >= 2 and 'program=ase' in data[1]: program = 'ase' for line in data: # Grab pertinent information from file if line.strip().find('External calculation') > -1: level, bs = 'ext', 'ext' if line.strip().find('Standard orientation:') > -1: orientation = 'Standard' if line.strip().find('IExCor=') > -1 and no_grid: try: dft_used = line.split('=')[2].split()[0] _ = grid_lookup[int(dft_used)] no_grid = False except (KeyError, ValueError, IndexError): pass if '\\Freq\\' in line.strip() and repeated_theory == 0: try: level, bs = (line.strip().split("\\")[4:6]) repeated_theory = 1 except IndexError: pass elif '|Freq|' in line.strip() and repeated_theory == 0: try: level, bs = (line.strip().split("|")[4:6]) repeated_theory = 1 except IndexError: pass if '\\SP\\' in line.strip() and repeated_theory == 0: try: level, bs = (line.strip().split("\\")[4:6]) repeated_theory = 1 except IndexError: pass elif '|SP|' in line.strip() and repeated_theory == 0: try: level, bs = (line.strip().split("|")[4:6]) repeated_theory = 1 except IndexError: pass if 'DLPNO BASED TRIPLES CORRECTION' in line.strip(): level = 'DLPNO-CCSD(T)' if 'Estimated CBS total energy' in line.strip(): try: bs = ("Extrapol." + line.strip().split()[4]) except IndexError: pass # Remove the restricted R or unrestricted U label if level[0] in ('R', 'U'): level = level[1:] #NWChem specific parsing if program == 'NWChem': keyword_line_1 = "gas phase" keyword_line_2 = '' keyword_line_3 = '' for i, line in enumerate(data): if line.strip().startswith("xc "): level=line.strip().split()[1] if line.strip().startswith("* library "): bs = line.strip().replace("* library ",'') #need to update these tags for NWChem solvation later if 'CPCM SOLVATION MODEL' in line.strip(): keyword_line_1 = "CPCM," if 'SMD CDS free energy correction energy' in line.strip(): keyword_line_2 = "SMD," if "Solvent: " in line.strip(): keyword_line_3 = line.strip().split()[-1] #need to update NWChem keyword for error calculation if 'Total times' in line: progress = 'Normal' elif 'error termination' in line: progress = 'Error' solvation_model = keyword_line_1 + keyword_line_2 + keyword_line_3 # Grab solvation models - Gaussian files if program == 'Gaussian': for i, line in enumerate(data): if '#' in line.strip() and a == 0: for j, line in enumerate(data[i:i + 10]): if '--' in line.strip(): a = a + 1 break if a != 0: break else: for k in range(len(line.strip().split("\n"))): keyword_line += line.strip().split("\n")[k] if 'Normal termination' in line: progress = 'Normal' elif 'Error termination' in line: progress = 'Error' keyword_line = keyword_line.lower() if 'scrf' not in keyword_line.strip(): solvation_model = "gas phase" else: start_scrf = keyword_line.strip().find('scrf') + 5 if start_scrf >= len(keyword_line): # Bare `scrf` with no body — Gaussian defaults to PCM/water. solvation_model = "scrf" elif keyword_line[start_scrf] == "(": end_scrf = keyword_line.find(")", start_scrf) solvation_model = "scrf=" + keyword_line[start_scrf:end_scrf] if solvation_model[-1] != ")": solvation_model = solvation_model + ")" else: start_scrf2 = keyword_line.strip().find('scrf') + 4 if keyword_line.find(" ", start_scrf) > -1: end_scrf = keyword_line.find(" ", start_scrf) else: end_scrf = len(keyword_line) if keyword_line[start_scrf2] == "(": solvation_model = "scrf=(" + keyword_line[start_scrf:end_scrf] if solvation_model[-1] != ")": solvation_model = solvation_model + ")" else: if keyword_line.find(" ", start_scrf) > -1: end_scrf = keyword_line.find(" ", start_scrf) else: end_scrf = len(keyword_line) solvation_model = "scrf=" + keyword_line[start_scrf:end_scrf] # xtb parsing for level of theory and solvation model elif program == 'xtb': # Hamiltonian (GFN0/1/2-xTB) → "level"; xtb has no basis set in the QC sense for ln in data: if 'Hamiltonian' in ln and 'xTB' in ln: for tok in ln.split(): if 'xTB' in tok: level = tok break if level != 'none': break bs = 'none' # Solvation from program-call line solvation_model = 'gas phase' for ln in data: stripped = ln.strip() if 'program call' in stripped and ':' in stripped: solvation_model = _xtb_solvation_from_call(stripped.split(':', 1)[1].strip()) break # Termination status for ln in data: if '[ERROR]' in ln or 'fatal error' in ln.lower(): progress = 'Error' break if 'finished run on' in ln: progress = 'Normal' # Q-Chem: parse $rem METHOD/BASIS from the input echo, solvation from # the run banner, and termination from "Thank you very much" footer. elif program == 'QChem': in_rem = False method, basis = 'none', 'none' cpcm = smd = solvent_name = '' for line in data: s = line.strip() if s.lower() == '$rem': in_rem = True continue if in_rem: if s == '$end' or s.startswith('$'): in_rem = False else: parts = s.split() if len(parts) >= 2: key = parts[0].lower() if key == 'method' and method == 'none': method = parts[1] elif key == 'basis' and basis == 'none': basis = parts[1] if 'Using C-PCM' in s or 'C-PCM dielectric factor' in s: cpcm = 'CPCM' elif 'Using the SMD solvation model' in s: smd = 'SMD' elif s.startswith('Solvent:'): try: solvent_name = s.split(':', 1)[1].strip() except IndexError: pass if 'Thank you very much for using Q-Chem' in line: progress = 'Normal' elif 'Q-Chem fatal error' in line or 'fatal error occurred' in line: progress = 'Error' level, bs = method, basis if smd: solvation_model = 'SMD,' + solvent_name if solvent_name else 'SMD' elif cpcm: solvation_model = 'CPCM,' + solvent_name if solvent_name else 'CPCM' # ASE extxyz: pull metadata directly from the comment-line key=values elif program == 'ase': info = _parse_extxyz_comment(data[1]) if len(data) >= 2 else {} lot = info.get('level_of_theory', '') if '/' in lot: level, bs = lot.split('/', 1) elif lot: level, bs = lot, 'none' else: level, bs = 'none', 'none' solvation_model = info.get('solvation_model', 'gas phase') or 'gas phase' try: float(info.get('scf_energy', '')) progress = 'Normal' except ValueError: progress = 'Incomplete' # ORCA parsing for solvation model and level of theory elif program == 'Orca': level, bs = _parse_orca_lot(data) keyword_line_1 = "gas phase" keyword_line_2 = '' keyword_line_3 = '' orca_abort_line = -1 last_freq_line = -1 for i, line in enumerate(data): if 'CPCM SOLVATION MODEL' in line.strip(): keyword_line_1 = "CPCM," if 'SMD CDS free energy correction energy' in line.strip(): keyword_line_2 = "SMD," if "Solvent: " in line.strip(): keyword_line_3 = line.strip().split()[-1] if 'ORCA TERMINATED NORMALLY' in line: progress = 'Normal' elif 'error termination' in line: progress = 'Error' if 'ORCA will abort' in line: orca_abort_line = i if 'VIBRATIONAL FREQUENCIES' in line and '---' not in line: last_freq_line = i # Abort after the last freq section means no valid thermo was produced if orca_abort_line > last_freq_line: progress = 'Error' solvation_model = keyword_line_1 + keyword_line_2 + keyword_line_3 level_of_theory = '/'.join([level, bs]) return level_of_theory, solvation_model, progress, orientation, dft_used
[docs] def gaussian_jobtype(filename): """Read the jobtype from a Gaussian archive string.""" job = '' with open(filename, encoding='utf-8', errors='replace') as f: for line in f: if line.strip().find('\\SP\\') > -1: job += 'SP' if line.strip().find('\\FOpt\\') > -1: job += 'GS' if line.strip().find('\\FTS\\') > -1: job += 'TS' if line.strip().find('\\Freq\\') > -1: job += 'Freq' return job
[docs] def parse_gaussian_thermo(file): """Parse Gaussian output for all thermochemistry-relevant data. Returns QCData with raw frequencies (negative = imaginary, no inversion applied). Frequency inversion is a user policy decision handled in thermo.py. Parameters ---------- file : str Path to Gaussian output file. """ qcdata = QCData(file=file, program='Gaussian') # Delegate solvation, dispersion, version, charge to parse_data (_, _, version_program, solvation_model, _, charge, empirical_dispersion, multiplicity) = parse_data(file) qcdata.version_program = version_program qcdata.solvation_model = solvation_model qcdata.empirical_dispersion = empirical_dispersion if charge is not None: qcdata.charge = charge if multiplicity is not None: qcdata.multiplicity = multiplicity # Job type from archive string qcdata.job_type = gaussian_jobtype(file) # Read file with open(file, encoding='utf-8', errors='replace') as f: g_output = f.readlines() # Auto-detect G4 composite method g4 = any('G4(0 K)' in line for line in g_output) # Detect ONIOM is_oniom = any('ONIOM: extrapolated energy' in line for line in g_output) qcdata.has_oniom = is_oniom # --- First pass: find link structure --- linkmax = 0 freqloc = 0 for line in g_output: if 'Normal termination' in line: linkmax += 1 if 'Frequencies --' in line: freqloc = linkmax # --- Second pass: extract data --- link = 0 frequency_wn = [] im_frequency_wn = [] fract_modelsys = [] per_atom_masses = [] # Gaussian's per-atom masses (respects iso= keyword) freq_started = False # True once we encounter the first "Frequencies --" in this link freq_done = False # True once VPT2 "Recovering" marker is seen (guards against duplicates) if freqloc == 0: freqloc = len(g_output) for i, line in enumerate(g_output): # Link counter if 'Normal termination' in line: link += 1 if link == freqloc: frequency_wn = [] im_frequency_wn = [] fract_modelsys = [] freq_started = False freq_done = False # Stop after freq link unless G4/composite if not g4 and link > freqloc: break # VPT2/anharmonic: "Recovering previously computed normal modes" means # the frequencies about to appear are a duplicate of the harmonic set if 'Recovering previously computed normal modes' in line: freq_done = True # Frequencies if not freq_done and line.strip().startswith('Frequencies -- '): freq_started = True if is_oniom: fract_line = g_output[i + 3] for j in range(2, 5): try: x = float(line.strip().split()[j]) if x > 0.0: frequency_wn.append(x) if is_oniom: try: y = float(fract_line.strip().split()[j]) / 100.0 y = float('{:.6f}'.format(y)) fract_modelsys.append(y) except (IndexError, ValueError): fract_modelsys.append(1.0) elif x < 0.0: im_frequency_wn.append(x) except IndexError: pass # --- SCF energy (all variants, last one wins) --- # Guard against VPT2 displaced geometry energies: once frequencies are # read, ignore further SCF Done / E2 / EUMP2 lines (but still allow # archive-line energies like CCSD= which appear after frequencies). elif not freq_started and line.strip().startswith('SCF Done:'): qcdata.scf_energy = float(line.strip().split()[4]) elif not freq_started and line.strip().startswith('E2('): spe_value = line.strip().split()[-1] qcdata.scf_energy = float(spe_value.replace('D', 'E')) elif line.strip().startswith('Counterpoise corrected energy'): qcdata.scf_energy = float(line.strip().split()[4]) elif not freq_started and 'EUMP2 =' in line.strip(): qcdata.scf_energy = float((line.strip().split()[5]).replace('D', 'E')) elif 'CCSD(T)=' in line.strip(): raw = line.strip().split('CCSD(T)=')[1].split('\\')[0].split()[0] qcdata.scf_energy = float(raw.replace('D', 'E')) elif 'CCSD=' in line.strip() and 'CCSD(T)' not in line.strip(): raw = line.strip().split('CCSD=')[1].split('\\')[0].split()[0] qcdata.scf_energy = float(raw.replace('D', 'E')) elif 'ONIOM: extrapolated energy' in line.strip(): qcdata.scf_energy = float(line.strip().split()[4]) elif line.strip().startswith('G4(0 K)'): qcdata.scf_energy = float(line.strip().split()[2]) if qcdata.zero_point_corr is not None: qcdata.scf_energy -= qcdata.zero_point_corr elif line.strip().startswith('E(ZPE)='): qcdata.zero_point_corr = float(line.strip().split()[1]) elif 'E(TD-HF/TD-DFT)' in line.strip(): qcdata.scf_energy = float(line.strip().split()[4]) elif ('Energy= ' in line.strip() and 'Predicted' not in line.strip() and 'Thermal' not in line.strip() and 'G4' not in line.strip()): qcdata.scf_energy = float(line.strip().split()[1]) # Coordinates: take the LAST "Standard orientation" or "Input orientation" block elif 'Standard orientation:' in line or 'Input orientation:' in line: atom_nums_tmp = [] cartesians_tmp = [] for k in range(i + 5, len(g_output)): if '-----' in g_output[k]: break parts = g_output[k].split() atom_nums_tmp.append(int(parts[1])) cartesians_tmp.append([float(parts[3]), float(parts[4]), float(parts[5])]) qcdata.atom_nums = atom_nums_tmp qcdata.cartesians = cartesians_tmp # Zero-point correction elif line.strip().startswith('Zero-point correction='): qcdata.zero_point_corr = float(line.strip().split()[2]) # Multiplicity elif 'Multiplicity' in line.strip(): try: qcdata.multiplicity = int(line.split('=')[-1].strip().split()[0]) except (ValueError, IndexError): qcdata.multiplicity = int(line.split()[-1]) # Per-atom isotope masses Gaussian printed for this freq job. Captured # so we can recompute high-precision rotational constants from the # geometry — the "Rotational temperatures" block prints only 3 sig figs, # which loses ~3 µHartree in G for heavy or floppy systems. elif (line.strip().startswith('Atom') and 'has atomic number' in line and 'and mass' in line): try: per_atom_masses.append(float(line.split('and mass')[1].split()[0])) except (ValueError, IndexError): pass # Molecular mass — also marks the end of one per-atom mass block, so # subsequent thermo sections (rare; G4 composites) start fresh. elif line.strip().startswith('Molecular mass:'): qcdata.molecular_mass = float(line.strip().split()[2]) # Rotational symmetry number elif line.strip().startswith('Rotational symmetry number'): qcdata.symmno = int((line.strip().split()[3]).split('.')[0]) # Point group / linearity elif line.strip().startswith('Full point group'): pg = line.strip().split()[3] qcdata.point_group = pg if pg in ('D*H', 'C*V'): qcdata.linear_mol = True # Rotational constants (GHz) elif line.strip().startswith('Rotational constants (GHZ):'): try: parts = line.strip().replace(':', ' ').split() qcdata.roconst = [float(parts[3]), float(parts[4]), float(parts[5])] except ValueError: if '********' in line.strip(): qcdata.linear_warning = True parts = line.strip().replace(':', ' ').split() qcdata.roconst = [float(parts[4]), float(parts[5])] # Rotational temperatures elif line.strip().startswith('Rotational temperature '): qcdata.rotemp = [float(line.strip().split()[3])] elif line.strip().startswith('Rotational temperatures'): try: qcdata.rotemp = [float(line.strip().split()[3]), float(line.strip().split()[4]), float(line.strip().split()[5])] except ValueError: if '********' in line.strip(): qcdata.linear_warning = True qcdata.rotemp = [float(line.strip().split()[4]), float(line.strip().split()[5])] # CPU time (not elif — checked independently for every line) if 'Job cpu time' in line.strip(): qcdata.cpu = [ int(line.split()[3]) + qcdata.cpu[0], int(line.split()[5]) + qcdata.cpu[1], int(line.split()[7]) + qcdata.cpu[2], 0 + qcdata.cpu[3], int(float(line.split()[9]) * 1000.0) + qcdata.cpu[4], ] qcdata.frequency_wn = frequency_wn qcdata.im_frequency_wn = im_frequency_wn if is_oniom: qcdata.fract_modelsys = fract_modelsys if qcdata.atom_nums: qcdata.atom_types = [periodictable[n] for n in qcdata.atom_nums] # High-precision rotational constants: Gaussian's "Rotational temperatures" # printout is only 3 sig figs, which propagates ~3 µHartree of error into G # via S_rot for systems with small θ_rot. Recompute MOI from the geometry # using the per-atom masses Gaussian itself reported (this respects the # iso= keyword), then derive roconst / rotemp at full double precision. # G4 composites can print masses for several thermo sections; the final # len(cartesians) entries always belong to the freq job we care about. if (qcdata.cartesians and len(per_atom_masses) >= len(qcdata.cartesians) and not qcdata.linear_warning and len(qcdata.rotemp) == 3): masses = np.asarray(per_atom_masses[-len(qcdata.cartesians):], float) coords = np.asarray(qcdata.cartesians, float) total_mass = float(masses.sum()) com = (masses[:, None] * coords).sum(axis=0) / total_mass r = coords - com Ixx = float((masses * (r[:, 1] ** 2 + r[:, 2] ** 2)).sum()) Iyy = float((masses * (r[:, 0] ** 2 + r[:, 2] ** 2)).sum()) Izz = float((masses * (r[:, 0] ** 2 + r[:, 1] ** 2)).sum()) Ixy = float(-(masses * r[:, 0] * r[:, 1]).sum()) Ixz = float(-(masses * r[:, 0] * r[:, 2]).sum()) Iyz = float(-(masses * r[:, 1] * r[:, 2]).sum()) inertia = np.array([[Ixx, Ixy, Ixz], [Ixy, Iyy, Iyz], [Ixz, Iyz, Izz]]) eigvals = np.sort(np.maximum(np.linalg.eigvalsh(inertia), 0.0)) if all(I > 1e-3 for I in eigvals): HC_OVER_KB = 1.4387768775 # K per cm⁻¹ roconst_cm = [16.857629 / I for I in eigvals] qcdata.roconst = [b * 29.9792458 for b in roconst_cm] qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm] qcdata.molecular_mass = total_mass _fill_mass_and_rotemp_from_geometry(qcdata) return qcdata
[docs] def parse_nwchem_thermo(file): """Parse NWChem output for all thermochemistry-relevant data. Returns QCData with raw frequencies (negative = imaginary, no inversion applied). Parameters ---------- file : str Path to NWChem output file. """ qcdata = QCData(file=file, program='NWChem') # Delegate version, charge, multiplicity to parse_data try: (_, _, version_program, solvation_model, _, charge, empirical_dispersion, multiplicity) = parse_data(file) qcdata.version_program = version_program qcdata.solvation_model = solvation_model qcdata.empirical_dispersion = empirical_dispersion if charge is not None: qcdata.charge = charge if multiplicity is not None: qcdata.multiplicity = multiplicity except (ValueError, IndexError): pass # NWChem has no archive string for job type; detect from content with open(file, encoding='utf-8', errors='replace') as f: g_output = f.readlines() frequency_wn = [] im_frequency_wn = [] for i, line in enumerate(g_output): # Frequencies (up to 6 per line) if line.strip().startswith('P.Frequency'): for j in range(1, 7): try: x = float(line.strip().split()[j]) if x > 0.0: frequency_wn.append(x) elif x < 0.0: im_frequency_wn.append(x) except IndexError: pass # SCF energy elif line.strip().startswith('Total DFT energy ='): qcdata.scf_energy = float(line.strip().split()[4]) # Zero-point correction elif line.strip().startswith('Zero-Point'): qcdata.zero_point_corr = float(line.strip().split()[8]) # Multiplicity — match input-deck echo "mult <N>" or runtime # "Spin multiplicity: <N>" (NWChem 7.x). elif 'mult ' in line.strip(): try: qcdata.multiplicity = int(line.split()[1]) except (ValueError, IndexError): qcdata.multiplicity = 1 elif 'Spin multiplicity:' in line: try: qcdata.multiplicity = int(line.split(":", 1)[1].strip().split()[0]) except (ValueError, IndexError): pass # Coordinates: take the LAST "Output coordinates in angstroms" block elif 'Output coordinates in angstroms' in line: atom_nums_tmp = [] cartesians_tmp = [] for k in range(i + 4, len(g_output)): cline = g_output[k].strip() if cline == '': break parts = cline.split() atom_nums_tmp.append(int(float(parts[2]))) cartesians_tmp.append([float(parts[3]), float(parts[4]), float(parts[5])]) qcdata.atom_nums = atom_nums_tmp qcdata.cartesians = cartesians_tmp # Molecular mass elif line.strip().find('mol. weight') != -1: qcdata.molecular_mass = float(line.strip().split()[-1][0:-1]) # Rotational symmetry number elif line.strip().find('symmetry #') != -1: qcdata.symmno = int(line.strip().split()[-1][0:-1]) # Point group / linearity elif line.strip().find('symmetry detected') != -1: pg = line.strip().split()[0] qcdata.point_group = pg if pg in ('D*H', 'C*V'): qcdata.linear_mol = True # Version (fallback if parse_data failed) elif 'nwchem branch' in line.strip() and not qcdata.version_program: qcdata.version_program = 'NWChem version ' + line.split()[3] # Rotational constants (convert cm⁻¹ to GHz) and temperatures elif line.strip().startswith('A=') or line.strip().startswith('B=') or line.strip().startswith('C='): letter = line.strip()[0] h = {'A': 0, 'B': 1, 'C': 2}[letter] qcdata.roconst[h] = float(line.strip().split()[1]) * 29.9792458 qcdata.rotemp[h] = float(line.strip().split()[4]) # CPU time if 'Total times' in line.strip(): secs = float(line.strip().split()[3][0:-1]) qcdata.cpu = [0, 0, 0, secs, 0.0] # Determine job type from content if len(frequency_wn) > 0 or len(im_frequency_wn) > 0: qcdata.job_type = 'Freq' else: qcdata.job_type = 'SP' qcdata.frequency_wn = frequency_wn qcdata.im_frequency_wn = im_frequency_wn if qcdata.atom_nums: qcdata.atom_types = [periodictable[n] for n in qcdata.atom_nums] _fill_mass_and_rotemp_from_geometry(qcdata) return qcdata
def _scale_cpu(cpu, factor): """Multiply a [days, hours, mins, secs, msecs] tuple by an integer factor. Used by the ORCA parser to convert wall time × nprocs → effective CPU. """ total_ms = ( cpu[0] * 86_400_000 + cpu[1] * 3_600_000 + cpu[2] * 60_000 + cpu[3] * 1_000 + cpu[4] ) * factor days, total_ms = divmod(total_ms, 86_400_000) hours, total_ms = divmod(total_ms, 3_600_000) mins, total_ms = divmod(total_ms, 60_000) secs, msecs = divmod(total_ms, 1_000) return [days, hours, mins, secs, msecs]
[docs] def parse_orca_thermo(file): """Parse ORCA output for all thermochemistry-relevant data. Uses native line-by-line parsing. Parameters ---------- file : str Path to ORCA output file. """ qcdata = QCData(file=file, program='Orca') with open(file, encoding='utf-8', errors='replace') as f: output = f.readlines() frequency_wn = [] im_frequency_wn = [] in_freq_section = False solvation_type = '' solvent_name = '' applied_freq_scale_factor = 1.0 _has_opt = False _has_ts = False _has_freq_kw = False _orca_abort_line = -1 _last_freq_line = -1 for i, line in enumerate(output): stripped = line.strip() # --- Frequency section state machine --- if 'VIBRATIONAL FREQUENCIES' in stripped and '---' not in stripped: in_freq_section = True frequency_wn = [] im_frequency_wn = [] _last_freq_line = i elif in_freq_section and 'cm**-1' in stripped: parts = stripped.split() try: freq_val = float(parts[1]) if freq_val > 0.0: frequency_wn.append(freq_val) elif freq_val < 0.0: im_frequency_wn.append(freq_val) # freq_val == 0.0 → skip (translational/rotational modes) except (IndexError, ValueError): pass elif in_freq_section: if stripped == '' and (frequency_wn or im_frequency_wn): in_freq_section = False # --- Main field parsing --- # Coordinates: take the LAST "CARTESIAN COORDINATES (ANGSTROEM)" block if 'CARTESIAN COORDINATES (ANGSTROEM)' in stripped: atom_nums_tmp = [] atom_types_tmp = [] cartesians_tmp = [] for k in range(i + 2, len(output)): cline = output[k].strip() if cline == '' or '---' in cline: break parts = cline.split() elem = parts[0] atom_types_tmp.append(elem) atom_nums_tmp.append(element_id(elem, num=True)) cartesians_tmp.append([float(parts[1]), float(parts[2]), float(parts[3])]) qcdata.atom_nums = atom_nums_tmp qcdata.atom_types = atom_types_tmp qcdata.cartesians = cartesians_tmp # SCF energy (last occurrence wins for linked jobs) elif stripped.startswith('FINAL SINGLE POINT ENERGY'): qcdata.scf_energy = float(stripped.split()[-1]) # Collect input keywords for job type detection (handled after loop) elif '|' in line and '>' in stripped and '!' in stripped: kw = stripped.split('!', 1)[1].upper() if 'OPTTS' in kw: _has_ts = True elif 'OPT' in kw: _has_opt = True if 'FREQ' in kw: _has_freq_kw = True # Charge elif 'Total Charge' in stripped and '....' in stripped: qcdata.charge = int(stripped.split()[-1]) # Multiplicity elif 'Multiplicity' in stripped and 'Mult' in stripped and '....' in stripped: qcdata.multiplicity = int(stripped.split()[-1]) # Program version elif 'Program Version' in stripped: qcdata.version_program = 'ORCA version ' + line.split()[2] # Zero-point energy (Eh) elif stripped.startswith('Zero point energy') and '...' in stripped: parts = stripped.split() try: dot_idx = parts.index('...') qcdata.zero_point_corr = float(parts[dot_idx + 1]) except (ValueError, IndexError): pass # Molecular mass (AMU) elif 'Total Mass' in stripped and '...' in stripped: parts = stripped.split() try: dot_idx = parts.index('...') qcdata.molecular_mass = float(parts[dot_idx + 1]) except (ValueError, IndexError): pass # Rotational constants in cm⁻¹ → convert to GHz and rotational temps elif stripped.startswith('Rotational constants in cm-1:'): rparts = stripped.split(':')[1].split() try: roconst_cm = [float(rparts[0]), float(rparts[1]), float(rparts[2])] qcdata.roconst = [b * 29.9792458 for b in roconst_cm] # Rotational temperature: T_rot = h*c*B/k_B where B in cm⁻¹ HC_OVER_KB = 1.4387768775 # K per cm⁻¹ qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm] except (IndexError, ValueError): pass # Point group and symmetry number (same line) elif 'Point Group:' in stripped and 'Symmetry Number:' in stripped: pg_part = stripped.split('Point Group:')[1].split(',')[0].strip() qcdata.point_group = pg_part # Linear point groups: ORCA 6 prints "C(inf)v"/"D(inf)h", # ORCA 5 prints "Cinfv"/"Dinfh" — match both via the "inf" stem. if 'inf' in pg_part.lower(): qcdata.linear_mol = True try: symm_part = stripped.split('Symmetry Number:')[1].strip() qcdata.symmno = int(symm_part) except (ValueError, IndexError): pass # Solvation model detection elif 'CPCM SOLVATION MODEL' in stripped: solvation_type = 'CPCM' elif 'SMD CDS free energy correction energy' in stripped: solvation_type = 'SMD' elif stripped.startswith('Solvent:') and '...' in stripped: solvent_name = stripped.split()[-1] # Frequency scaling factor (ORCA pre-applies this to reported frequencies) elif stripped.startswith('Scaling factor for frequencies'): try: applied_freq_scale_factor = float(stripped.split('=')[1].split('(')[0].strip()) except (IndexError, ValueError): pass # CPU time. ORCA prints wall time only ("TOTAL RUN TIME:"); we scale # by the number of MPI processes (parsed from "Program running with # N parallel MPI-processes") to get an effective CPU time, matching # how Gaussian/NWChem/xTB report theirs. elif stripped.startswith('TOTAL RUN TIME:'): parts = stripped.split() try: cpu_list = [ int(parts[3]), # days int(parts[5]), # hours int(parts[7]), # minutes int(parts[9]), # seconds int(parts[11]), # msec ] if qcdata.nprocs > 1: cpu_list = _scale_cpu(cpu_list, qcdata.nprocs) qcdata.cpu = cpu_list except (IndexError, ValueError): pass # MPI process count (printed once per task; we keep the first hit). elif 'parallel MPI-processes' in stripped and qcdata.nprocs == 1: try: qcdata.nprocs = int(stripped.split('with')[1].split('parallel')[0].strip()) except (IndexError, ValueError): pass # ORCA abort: optimization failed before freq calculation could run. if 'ORCA will abort' in stripped: _orca_abort_line = i # If the last abort comes after the last freq section (or there are no freqs), # any frequencies/ZPE present are from calc_hess, not the actual freq job — discard. if _orca_abort_line > _last_freq_line: frequency_wn = [] im_frequency_wn = [] qcdata.zero_point_corr = None # Assemble solvation model if solvation_type: if solvent_name: qcdata.solvation_model = solvation_type + ',' + solvent_name else: qcdata.solvation_model = solvation_type # Un-scale frequencies if ORCA applied a scaling factor, so that # frequency_wn always contains unscaled values (matching Gaussian convention) qcdata.applied_freq_scale_factor = applied_freq_scale_factor if applied_freq_scale_factor != 1.0 and applied_freq_scale_factor > 0.0: frequency_wn = [f / applied_freq_scale_factor for f in frequency_wn] im_frequency_wn = [f / applied_freq_scale_factor for f in im_frequency_wn] qcdata.frequency_wn = frequency_wn qcdata.im_frequency_wn = im_frequency_wn # Fallback: parse coordinates from "* xyz charge mult" input block # when CARTESIAN COORDINATES (ANGSTROEM) block is absent (e.g., thermo-only re-analysis) if not qcdata.atom_nums: for i, line in enumerate(output): stripped = line.strip() if stripped.startswith('|') and '* xyz' in stripped: atom_nums_tmp = [] atom_types_tmp = [] cartesians_tmp = [] for k in range(i + 1, len(output)): cline = output[k].strip() # Input lines start with "| N>" prefix if '|' in cline: content = cline.split('>', 1)[-1].strip() if '>' in cline else cline.strip('| ').strip() else: break if content == '*' or content == '' or 'END OF INPUT' in content: break parts = content.split() if len(parts) >= 4: elem = parts[0] atom_types_tmp.append(elem) atom_nums_tmp.append(element_id(elem, num=True)) cartesians_tmp.append([float(parts[1]), float(parts[2]), float(parts[3])]) if atom_nums_tmp: qcdata.atom_nums = atom_nums_tmp qcdata.atom_types = atom_types_tmp qcdata.cartesians = cartesians_tmp break # For linear molecules, filter rotemp to non-zero values only # (linear molecules have one near-zero rotational constant along the axis) if qcdata.linear_mol: nonzero_rotemp = [t for t in qcdata.rotemp if t > 1e-10] if nonzero_rotemp: qcdata.rotemp = nonzero_rotemp # Determine job type from combined input keywords and output content has_freq = _has_freq_kw or len(frequency_wn) > 0 or len(im_frequency_wn) > 0 if _has_ts: qcdata.job_type = 'TSFreq' if has_freq else 'TS' elif _has_opt: qcdata.job_type = 'GSFreq' if has_freq else 'GS' elif has_freq: qcdata.job_type = 'Freq' else: qcdata.job_type = 'SP' _fill_mass_and_rotemp_from_geometry(qcdata) return qcdata
def _xtb_solvation_from_call(call_line): """Parse the xtb ``program call`` line for an implicit-solvation flag. Returns a string like 'GBSA,water', 'CPCM-X,dimethylsulfoxide', 'ALPB,toluene', or 'gas phase'. """ tokens = call_line.split() for i, tok in enumerate(tokens): if tok in ('-g', '--gbsa') and i + 1 < len(tokens): return 'GBSA,' + tokens[i + 1] if tok == '--cpcmx' and i + 1 < len(tokens): return 'CPCM-X,' + tokens[i + 1] if tok == '--alpb' and i + 1 < len(tokens): return 'ALPB,' + tokens[i + 1] return 'gas phase' def _xtb_read_xyz_fallback(file): """Read the paired .xyz file when the .out has no final-geometry block. Returns (atom_nums, atom_types, cartesians) — empty lists if not readable. """ xyz = os.path.splitext(file)[0] + '.xyz' if not os.path.exists(xyz): return [], [], [] try: with open(xyz, encoding='utf-8', errors='replace') as f: lines = f.readlines() except OSError: return [], [], [] if len(lines) < 2: return [], [], [] try: natoms = int(lines[0].strip()) except (ValueError, IndexError): return [], [], [] atom_nums, atom_types, cartesians = [], [], [] for line in lines[2:2 + natoms]: parts = line.split() if len(parts) < 4: continue elem = parts[0] try: atom_types.append(elem) atom_nums.append(element_id(elem, num=True)) cartesians.append([float(parts[1]), float(parts[2]), float(parts[3])]) except ValueError: continue return atom_nums, atom_types, cartesians
[docs] def parse_xtb_thermo(file): """Parse xtb output for all thermochemistry-relevant data. xtb takes a ``.xyz`` coordinate file plus CLI flags rather than an input deck. The parser handles single-point (``xtb file.xyz``), Hessian-only (``--hess``), and optimization+Hessian (``--ohess``) jobs, plus implicit solvation (GBSA via ``-g``, ALPB via ``--alpb``, CPCM-X via ``--cpcmx``). For runs without ``--opt``/``--ohess`` (no optimized-geometry block in the output), Cartesians fall back to the paired ``.xyz`` input file. Parameters ---------- file : str Path to xtb output file. """ qcdata = QCData(file=file, program='xtb') with open(file, encoding='utf-8', errors='replace') as f: output = f.readlines() # Empty / aborted runs: bail out early but record the program. if not output or any('[ERROR]' in line for line in output): if any('xtb version' in line for line in output): for line in output: if 'xtb version' in line: parts = line.split() try: qcdata.version_program = 'xtb version ' + parts[parts.index('version') + 1] except (ValueError, IndexError): pass break return qcdata HC_OVER_KB = 1.4387768775 # K per cm⁻¹ frequency_wn = [] im_frequency_wn = [] last_freq_idx = -1 program_call = '' # First pass: locate the LAST vibrational-frequencies header. Linear # molecules and atoms use the ``vibrational frequencies (cm⁻¹)`` form; # everything else uses ``projected vibrational frequencies (cm⁻¹)``. for i, line in enumerate(output): if 'vibrational frequencies' in line and '(cm' in line: last_freq_idx = i # Second pass: extract scalar fields and geometry; freq parsing happens once for i, line in enumerate(output): stripped = line.strip() # --- Version if 'xtb version' in stripped and not qcdata.version_program: parts = stripped.split() try: qcdata.version_program = 'xtb version ' + parts[parts.index('version') + 1] except (ValueError, IndexError): pass # --- Program call line (for solvation flag parsing) elif 'program call' in stripped and ':' in stripped and not program_call: program_call = stripped.split(':', 1)[1].strip() # --- Charge: ": net charge <n> :" elif 'net charge' in stripped and stripped.startswith(':'): parts = stripped.split() try: qcdata.charge = int(parts[parts.index('charge') + 1]) except (ValueError, IndexError): pass # --- Multiplicity from "unpaired electrons" (mult = unpaired + 1) elif 'unpaired electrons' in stripped and stripped.startswith(':'): parts = stripped.split() try: unpaired = int(parts[parts.index('electrons') + 1]) qcdata.multiplicity = unpaired + 1 except (ValueError, IndexError): pass # --- Electronic SCF energy ":: total energy <E> Eh ::" # Last occurrence wins (final SCC after opt, or sole SCC for SP) elif stripped.startswith(':: total energy'): parts = stripped.split() try: qcdata.scf_energy = float(parts[3]) except (ValueError, IndexError): pass # --- Zero-point energy elif stripped.startswith(':: zero point energy'): parts = stripped.split() try: qcdata.zero_point_corr = float(parts[4]) except (ValueError, IndexError): pass # --- Molecular mass: "molecular mass/u : <m>" elif stripped.startswith('molecular mass/u'): parts = stripped.split() try: qcdata.molecular_mass = float(parts[-1]) except (ValueError, IndexError): pass # --- Rotational constants in cm⁻¹: "rotational constants/cm⁻¹ : B1 B2 B3" # xtb prints "Infinity" for atoms (no rotation); skip in that case. elif stripped.startswith('rotational constants/cm'): parts = stripped.split(':', 1)[1].split() if ':' in stripped else stripped.split() if 'Infinity' in parts[:3]: pass # atom: leave roconst/rotemp at defaults else: try: roconst_cm = [float(parts[0]), float(parts[1]), float(parts[2])] qcdata.roconst = [b * 29.9792458 for b in roconst_cm] qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm] except (IndexError, ValueError): pass # --- Linear marker (thermo SETUP block) elif 'linear?' in stripped and stripped.startswith(':'): if 'true' in stripped: qcdata.linear_mol = True # --- Point group from "symmetry <pg>" elif stripped.startswith(':') and 'symmetry' in stripped and 'rotational' not in stripped: parts = stripped.split() try: idx = parts.index('symmetry') if idx + 1 < len(parts) and parts[idx + 1] != ':': pg = parts[idx + 1] if pg not in ('GBSA', 'found', 'elements:'): qcdata.point_group = pg except ValueError: pass # --- Symmetry number ": rotational number <n> :" elif 'rotational number' in stripped and stripped.startswith(':'): parts = stripped.split() try: qcdata.symmno = int(parts[parts.index('number') + 1]) except (ValueError, IndexError): pass # --- Final optimized geometry block (only present for --opt/--ohess) elif stripped == 'final structure:': # Skip the "===" line; line i+1 is "===", i+2 is atom count, i+3 is comment try: natoms = int(output[i + 2].strip()) except (ValueError, IndexError): continue atom_nums_tmp = [] atom_types_tmp = [] cartesians_tmp = [] for k in range(i + 4, i + 4 + natoms): if k >= len(output): break parts = output[k].split() if len(parts) < 4: break try: atom_types_tmp.append(parts[0]) atom_nums_tmp.append(element_id(parts[0], num=True)) cartesians_tmp.append([float(parts[1]), float(parts[2]), float(parts[3])]) except ValueError: break qcdata.atom_nums = atom_nums_tmp qcdata.atom_types = atom_types_tmp qcdata.cartesians = cartesians_tmp # --- CPU time from "total:" block: "* cpu-time: <D> d, <H> h, <M> min, <S> sec" # xtb prints two spaces ("* cpu-time:") so match the substring. elif 'cpu-time:' in stripped and stripped.startswith('*') and qcdata.cpu == [0, 0, 0, 0, 0]: # First (= "total:") cpu-time line; subsequent are SCF/ANC/hess subtimers parts = stripped.replace(',', ' ').split() try: d_idx = parts.index('d') h_idx = parts.index('h') m_idx = parts.index('min') s_idx = parts.index('sec') days = int(parts[d_idx - 1]) hours = int(parts[h_idx - 1]) mins = int(parts[m_idx - 1]) secs_float = float(parts[s_idx - 1]) secs = int(secs_float) msecs = int((secs_float - secs) * 1000) qcdata.cpu = [days, hours, mins, secs, msecs] except (ValueError, IndexError): pass # Frequencies from the LAST projected-vibrational-frequencies block. # xtb prints all 3N modes; the first 5 (linear) or 6 (non-linear) are the # zero translational/rotational eigenvalues from projection — skip them by # filtering |v| < 1.0 cm⁻¹. Negatives below that are imaginary modes. if last_freq_idx >= 0: for k in range(last_freq_idx + 1, len(output)): parts = output[k].split() if not parts or parts[0] != 'eigval': break for tok in parts[2:]: try: v = float(tok) except ValueError: continue if abs(v) < 1.0: continue if v > 0: frequency_wn.append(v) else: im_frequency_wn.append(v) qcdata.frequency_wn = frequency_wn qcdata.im_frequency_wn = im_frequency_wn # Multiplicity: if `unpaired electrons` was missing from output, leave at default 1. # Solvation: parse from program call line if program_call: qcdata.solvation_model = _xtb_solvation_from_call(program_call) else: qcdata.solvation_model = 'gas phase' # xtb has no DFT dispersion; GFN includes its own D3/D4 implicitly. qcdata.empirical_dispersion = 'No empirical dispersion detected' # Geometry fallback: SP-only and --hess runs lack a 'final structure:' block. if not qcdata.atom_nums: atom_nums, atom_types, cartesians = _xtb_read_xyz_fallback(file) if atom_nums: qcdata.atom_nums = atom_nums qcdata.atom_types = atom_types qcdata.cartesians = cartesians # Mass / rotational fallback: --hess-only runs print frequencies but skip # the THERMODYNAMIC block, so 'molecular mass / u' and 'rotational # constants / cm⁻¹' are absent. Recompute from the geometry we just loaded. _fill_mass_and_rotemp_from_geometry(qcdata) # Linear molecules: keep only physical rotational temperatures. xtb # inverts a near-zero moment of inertia for the linear axis, producing # either a huge positive number (e.g. 1e27 K) or a huge negative one; # restrict to 0 < T_rot < 1e10 K to drop both pathological cases. if qcdata.linear_mol and qcdata.rotemp: physical = [t for t in qcdata.rotemp if 0.0 < t < 1.0e10] if physical: qcdata.rotemp = physical qcdata.roconst = [t / HC_OVER_KB * 29.9792458 for t in physical] # Job type from observed content has_opt_geom = any('final structure:' in line for line in output) has_freq = bool(frequency_wn) or bool(im_frequency_wn) if has_opt_geom and has_freq: qcdata.job_type = 'GSFreq' elif has_opt_geom: qcdata.job_type = 'GS' elif has_freq: qcdata.job_type = 'Freq' else: qcdata.job_type = 'SP' return qcdata
# Bohr → Ångström conversion (squared, for amu*Bohr² → amu*Ų) _BOHR2_TO_ANGSTROM2 = 0.5291772109 ** 2
[docs] def parse_qchem_thermo(file): """Parse a Q-Chem 6 output file for all thermochemistry-relevant data. Handles single-job and multi-job (`@@@`-separated opt + freq + sp) inputs: the LAST occurrence wins for SCF energy, geometry, and frequencies, so the converged opt geometry and post-correlation energies are picked up correctly. Recognises native HF/DFT total energy, MP2 total energy, CCSD total energy, and CCSD(T) total energy via the precedence CCSD(T) > CCSD > MP2 > Total energy. Parameters ---------- file : str Path to a Q-Chem 6 output file. """ qcdata = QCData(file=file, program='QChem') with open(file, encoding='utf-8', errors='replace') as f: output = f.readlines() if not output: return qcdata HC_OVER_KB = 1.4387768775 # K per cm⁻¹ # First pass: locate the last freq block, last geometry block, last # thermo header, and the input $molecule + $rem echoes (for charge, # multiplicity, and the requested correlation method). last_geom_header = -1 last_thermo_header = -1 last_freq_header = -1 molecule_block_lines = [] rem_methods = [] in_molecule_block = False in_rem_block = False user_input_seen = False for i, line in enumerate(output): stripped = line.strip() if 'You are running Q-Chem version' in stripped: try: qcdata.version_program = 'Q-Chem version ' + stripped.split('version:')[1].strip() except IndexError: pass elif 'User input:' in stripped: user_input_seen = True elif user_input_seen and stripped == '$molecule' and not molecule_block_lines: in_molecule_block = True continue elif in_molecule_block: if stripped == '$end' or stripped.startswith('$'): in_molecule_block = False elif stripped: molecule_block_lines.append(stripped) elif user_input_seen and stripped.lower() == '$rem': in_rem_block = True continue elif in_rem_block: if stripped == '$end' or stripped.startswith('$'): in_rem_block = False elif stripped and not stripped.startswith('!'): parts = stripped.split() if len(parts) >= 2 and parts[0].lower() == 'method': rem_methods.append(parts[1].lower()) if 'Standard Nuclear Orientation (Angstroms)' in line: last_geom_header = i # Anchor on the harmonic block ("AT 298.15 K AND ..."); skip the # anharmonic VPT2/TOSH summaries that print only ZPE. if 'STANDARD THERMODYNAMIC QUANTITIES AT' in line: last_thermo_header = i if stripped.startswith('Frequency:'): last_freq_header = i # Method of the FINAL job step decides which energy line to trust. final_method = rem_methods[-1] if rem_methods else '' # Charge / multiplicity from the input $molecule block. # First non-`READ` line is "<charge> <mult>". for line in molecule_block_lines: s = line.strip() if s.upper() == 'READ': continue parts = s.split() if len(parts) >= 2: try: qcdata.charge = int(parts[0]) qcdata.multiplicity = int(parts[1]) except ValueError: pass break # Multiplicity fallback from "There are N alpha and M beta electrons". if qcdata.multiplicity == 1: for line in output: if 'alpha and' in line and 'beta electrons' in line: parts = line.split() try: nalpha = int(parts[parts.index('alpha') - 1]) nbeta = int(parts[parts.index('beta') - 1]) qcdata.multiplicity = abs(nalpha - nbeta) + 1 except (ValueError, IndexError): pass # Energies — collect candidates; pick by FINAL method, not by precedence. # Q-Chem prints "MP2 total energy" as an extra analysis even when the # method is e.g. B2PLYP (a double hybrid that's properly captured by the # SCF "Total energy" line), so we can't blindly prefer correlated lines. scf_total = None mp2_total = None ccsd_total = None ccsdt_total = None for line in output: stripped = line.strip() if stripped.startswith('Total energy ='): try: scf_total = float(stripped.split('=')[1].split()[0]) except (IndexError, ValueError): pass elif 'MP2' in stripped and 'total energy =' in stripped: try: mp2_total = float(stripped.split('=')[1].split()[0]) except (IndexError, ValueError): pass elif stripped.startswith('CCSD total energy') or stripped.startswith('CCSD total energy'): try: ccsd_total = float(stripped.split('=')[1].split()[0]) except (IndexError, ValueError): pass elif stripped.startswith('CCSD(T) total energy'): try: ccsdt_total = float(stripped.split('=')[1].split()[0]) except (IndexError, ValueError): pass if 'ccsd(t)' in final_method and ccsdt_total is not None: qcdata.scf_energy = ccsdt_total elif final_method == 'ccsd' and ccsd_total is not None: qcdata.scf_energy = ccsd_total elif final_method in ('mp2', 'rimp2', 'ump2') and mp2_total is not None: qcdata.scf_energy = mp2_total else: qcdata.scf_energy = scf_total # Solvation solvation_type = '' solvent_name = '' for line in output: stripped = line.strip() if 'Using C-PCM' in stripped or 'C-PCM dielectric factor' in stripped: solvation_type = solvation_type or 'CPCM' elif 'Using the SMD solvation model' in stripped: solvation_type = 'SMD' elif stripped.startswith('Solvent:'): try: solvent_name = stripped.split(':', 1)[1].strip() except IndexError: pass if solvation_type: qcdata.solvation_model = (solvation_type + ',' + solvent_name) if solvent_name else solvation_type # Final geometry — read from the LAST Standard Nuclear Orientation block. if last_geom_header >= 0: atom_nums, atom_types, cartesians = [], [], [] # Skip 2 header lines + the dashes line before atoms. i = last_geom_header + 3 while i < len(output): parts = output[i].split() if len(parts) >= 5: try: int(parts[0]) # row index elem = parts[1] xyz = [float(parts[2]), float(parts[3]), float(parts[4])] atom_types.append(elem) atom_nums.append(element_id(elem, num=True)) cartesians.append(xyz) i += 1 continue except ValueError: pass break qcdata.atom_types = atom_types qcdata.atom_nums = atom_nums qcdata.cartesians = cartesians # Frequencies — collect from contiguous Frequency: lines starting at the last header. if last_freq_header >= 0: # Walk forward from the last_freq_header but we want all preceding # contiguous Frequency: lines that belong to the same final block. # Strategy: scan the file once more and collect freqs from the LAST # contiguous run of Frequency: lines (separated only by columns of # mode data, never by another major section header). freq_blocks = [] current = [] for line in output: stripped = line.strip() if stripped.startswith('Frequency:'): try: nums = [float(x) for x in stripped.split(':', 1)[1].split()] current.extend(nums) except ValueError: pass elif current and ('STANDARD THERMODYNAMIC' in line or 'Standard Nuclear Orientation' in line): # End of a freq block; new section ahead. freq_blocks.append(current) current = [] if current: freq_blocks.append(current) if freq_blocks: freqs = freq_blocks[-1] qcdata.frequency_wn = [f for f in freqs if f >= 0] qcdata.im_frequency_wn = [f for f in freqs if f < 0] # Point group: printed once per SCF (before the thermo block); take LAST. for line in output: if 'Molecular Point Group' in line: try: after = line.split('Molecular Point Group', 1)[1].strip() qcdata.point_group = after.split()[0] except (IndexError, ValueError): pass # Thermo block: ZPE, mass, symmno, principal moments (last block wins). if last_thermo_header >= 0: for line in output[last_thermo_header:]: stripped = line.strip() if stripped.startswith('Zero point vibrational energy:'): # "Zero point vibrational energy: 14.050 kcal/mol" try: zpe_kcal = float(stripped.split(':')[1].split()[0]) qcdata.zero_point_corr = zpe_kcal / 627.509541 except (IndexError, ValueError): pass elif stripped.startswith('Molecular Mass:'): try: qcdata.molecular_mass = float(stripped.split(':')[1].split()[0]) except (IndexError, ValueError): pass elif stripped.startswith('Eigenvalues --'): try: eigs_bohr2 = [float(x) for x in stripped.split('--', 1)[1].split()][:3] eigs_a2 = [e * _BOHR2_TO_ANGSTROM2 for e in eigs_bohr2] nonzero = [e for e in eigs_a2 if e > 1e-6] if len(nonzero) == 2: # linear qcdata.linear_mol = True B_cm = 16.857629 / nonzero[-1] qcdata.roconst = [B_cm * 29.9792458] * 3 qcdata.rotemp = [HC_OVER_KB * B_cm] * 3 elif len(nonzero) == 3: roconst_cm = [16.857629 / I for I in nonzero] # noqa: E741 qcdata.roconst = [b * 29.9792458 for b in roconst_cm] qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm] except (IndexError, ValueError): pass elif 'Rotational Symmetry Number is' in stripped: try: qcdata.symmno = int(stripped.split('is')[1].split()[0]) except (IndexError, ValueError): pass # CPU time: "Total job time: 1.70s(wall), 21.27s(cpu)" — take the (cpu) # value (the wall × ncores estimate Q-Chem prints) so the sum matches # the Gaussian/NWChem/xTB convention of true summed CPU. for line in reversed(output): if 'Total job time:' in line: try: cpu_s = float(line.split(',')[1].split('s(cpu)')[0].strip()) days = int(cpu_s // 86400) rem = cpu_s - days * 86400 hours = int(rem // 3600) rem -= hours * 3600 mins = int(rem // 60) rem -= mins * 60 secs = int(rem) ms = int(round((rem - secs) * 1000)) qcdata.cpu = [days, hours, mins, secs, ms] except (IndexError, ValueError): pass break # Job type from observed content. has_freq = bool(qcdata.frequency_wn) or bool(qcdata.im_frequency_wn) has_geom_block = last_geom_header >= 0 if qcdata.im_frequency_wn: qcdata.job_type = 'TS' elif has_geom_block and has_freq: qcdata.job_type = 'GSFreq' elif has_freq: qcdata.job_type = 'Freq' else: qcdata.job_type = 'SP' _fill_mass_and_rotemp_from_geometry(qcdata) return qcdata
# Regex matching extxyz comment-line tokens: bare key=value or key="value with spaces" _EXTXYZ_TOKEN = re.compile(r'(\w+)=("(?:[^"\\]|\\.)*"|\S+)') def _parse_extxyz_comment(line): """Parse an extxyz comment line into a {key: str} dict. Strips surrounding double quotes from quoted values; leaves unquoted values untouched. Caller is responsible for type coercion. """ out = {} for key, val in _EXTXYZ_TOKEN.findall(line): if len(val) >= 2 and val[0] == '"' and val[-1] == '"': val = val[1:-1] out[key] = val return out def _extxyz_bool(s): """Decode an extxyz boolean (T/F/True/False) to a Python bool.""" return str(s).strip().lower() in ('t', 'true', '1', 'yes')
[docs] def parse_ase_thermo(file): """Parse a GoodVibes ASE thermo extxyz file. The format is standard extended XYZ with thermochemistry metadata in the second line (key=value, ASE Atoms.info convention). Required keys: ``program=ase``, ``scf_energy``. Frequencies are space-separated in cm-1 (negatives are imaginary). See tests/ase/README.md for the full spec. """ qcdata = QCData(file=file, program='ase') with open(file, encoding='utf-8', errors='replace') as f: lines = f.readlines() if len(lines) < 2: return qcdata try: natoms = int(lines[0].strip()) except ValueError: return qcdata info = _parse_extxyz_comment(lines[1]) # Provenance qcdata.version_program = info.get('ase_version', 'ase') # Charge / multiplicity (defaults: 0 / 1) try: qcdata.charge = int(float(info.get('charge', '0'))) except ValueError: qcdata.charge = 0 try: qcdata.multiplicity = int(float(info.get('multiplicity', '1'))) except ValueError: qcdata.multiplicity = 1 # Electronic energy (Hartree by default; eV converted) if 'scf_energy' in info: try: e = float(info['scf_energy']) units = info.get('scf_energy_units', 'Hartree').lower() if units in ('ev',): e = e / 27.211386245988 elif units in ('kcal/mol', 'kcalmol'): e = e / 627.509541 elif units in ('kj/mol', 'kjmol'): e = e / 2625.4996394799 qcdata.scf_energy = e except ValueError: pass # Zero-point energy (Hartree) if 'zpe' in info: try: qcdata.zero_point_corr = float(info['zpe']) except ValueError: pass # Frequencies (cm-1; negatives are imaginary) if 'frequencies' in info: for tok in info['frequencies'].split(): try: v = float(tok) except ValueError: continue if v < 0: qcdata.im_frequency_wn.append(v) else: qcdata.frequency_wn.append(v) # Optional model-chemistry metadata qcdata.solvation_model = info.get('solvation_model', '') qcdata.empirical_dispersion = info.get('empirical_dispersion', '') qcdata.point_group = info.get('point_group', '') if 'symmno' in info: try: qcdata.symmno = int(float(info['symmno'])) except ValueError: pass if 'linear_mol' in info: qcdata.linear_mol = _extxyz_bool(info['linear_mol']) if 'applied_freq_scale_factor' in info: try: qcdata.applied_freq_scale_factor = float(info['applied_freq_scale_factor']) except ValueError: pass # Atom block: species, x, y, z (additional columns ignored) atom_nums, atom_types, cartesians = [], [], [] for line in lines[2:2 + natoms]: parts = line.split() if len(parts) < 4: continue elem = parts[0] try: xyz = [float(parts[1]), float(parts[2]), float(parts[3])] except ValueError: continue atom_types.append(elem) atom_nums.append(element_id(elem, num=True)) cartesians.append(xyz) qcdata.atom_types = atom_types qcdata.atom_nums = atom_nums qcdata.cartesians = cartesians # Molecular mass: explicit override, else compute from atomic masses if 'molecular_mass' in info: try: qcdata.molecular_mass = float(info['molecular_mass']) except ValueError: pass if not qcdata.molecular_mass and atom_types: qcdata.molecular_mass = sum(ATOMIC_MASSES.get(a, 0.0) for a in atom_types) # Rotational constants: explicit override (cm-1), else derived from inertia HC_OVER_KB = 1.4387768775 # K per cm⁻¹ if 'roconst_cm' in info: try: roconst_cm = [float(t) for t in info['roconst_cm'].split()][:3] qcdata.roconst = [b * 29.9792458 for b in roconst_cm] qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm] except ValueError: pass elif 'rotemp' in info: try: qcdata.rotemp = [float(t) for t in info['rotemp'].split()][:3] qcdata.roconst = [t / HC_OVER_KB * 29.9792458 for t in qcdata.rotemp] except ValueError: pass if not any(qcdata.rotemp) and len(atom_types) >= 2: _mass, eigvals = _principal_moments_of_inertia(atom_types, cartesians) # Linear molecules: smallest principal moment is ~0. nonzero = [e for e in eigvals if e > 1e-3] if not qcdata.linear_mol and len(nonzero) == 2: qcdata.linear_mol = True if qcdata.linear_mol: # Use the largest two equal moments; physics needs a single B. B_cm = 16.857629 / nonzero[-1] # cm⁻¹ qcdata.roconst = [B_cm * 29.9792458] * 3 qcdata.rotemp = [HC_OVER_KB * B_cm] * 3 elif len(nonzero) == 3: roconst_cm = [16.857629 / I for I in nonzero] # noqa: E741 qcdata.roconst = [b * 29.9792458 for b in roconst_cm] qcdata.rotemp = [HC_OVER_KB * b for b in roconst_cm] # Job type: explicit override, else infer from frequency presence/sign if 'job_type' in info: qcdata.job_type = info['job_type'] else: if qcdata.im_frequency_wn: qcdata.job_type = 'TS' elif qcdata.frequency_wn: qcdata.job_type = 'Freq' else: qcdata.job_type = 'SP' return qcdata
[docs] def parse_qcdata(file): """Parse any supported output file into a QCData object. Detects program from file content and delegates to the correct parser. Parameters ---------- file : str Path to quantum chemistry output file. """ stub = os.path.splitext(file)[0] possible_filenames = (stub + '.log', stub + '.out', stub + '.extxyz', file) data = None actual_file = file for possible_filename in possible_filenames: if os.path.exists(possible_filename): actual_file = possible_filename with open(possible_filename, encoding='utf-8', errors='replace') as f: data = f.readlines() break if data is None: return QCData(file=file, program='unknown') # Detect program from first ~80 lines (Q-Chem banner is past the SLURM preamble) program = 'unknown' for line in data[:80]: if 'Gaussian' in line: program = 'Gaussian' break if '* O R C A *' in line: program = 'Orca' break if 'NWChem' in line: program = 'NWChem' break if 'x T B' in line or 'xtb version' in line: program = 'xtb' break if 'You are running Q-Chem' in line or 'Welcome to Q-Chem' in line: program = 'QChem' break # ASE extxyz: program=ase appears in the comment line (2nd line). if program == 'unknown' and len(data) >= 2 and 'program=ase' in data[1]: program = 'ase' if program == 'Gaussian': return parse_gaussian_thermo(actual_file) elif program == 'Orca': return parse_orca_thermo(actual_file) elif program == 'NWChem': return parse_nwchem_thermo(actual_file) elif program == 'xtb': return parse_xtb_thermo(actual_file) elif program == 'QChem': return parse_qchem_thermo(actual_file) elif program == 'ase': return parse_ase_thermo(actual_file) else: return QCData(file=file, program='unknown')