Source code for xtb_step.substep

# -*- coding: utf-8 -*-

"""A base class for substeps of the xTB plug-in.

All substeps (Energy, Optimization, Frequencies, and any future substeps such
as MD or metadynamics) inherit from ``Substep``. This base class holds the
machinery that is genuinely common across substeps:

* Module-level constants for method <-> CLI flag mapping and the supported
  solvent list per solvation model.
* A periodicity check that refuses periodic input with a clear message.
* A writer for the ``coord.xyz`` input file consumed by xtb.
* A builder for the common parts of the xtb command line (method, charge,
  multiplicity, accuracy, solvation, JSON output).
* A wrapper around the SEAMM executor for invoking the xtb binary.
* A JSON-output parser for ``xtbout.json``.
* A text parser for the few quantities not exposed in the JSON
  (notably the thermochemistry block from Hessian runs).

It deliberately does NOT bake in any assumption that a substep is
energy-related, so an MD substep, for example, can inherit directly from
``Substep`` without going through ``Energy``.
"""

import configparser
import importlib.resources
import json
import logging
from pathlib import Path
import pprint  # noqa: F401
import re
import shutil

import seamm
from seamm_util import ureg, Q_  # noqa: F401
import seamm_util.printing as printing

logger = logging.getLogger(__name__)
job = printing.getPrinter()
printer = printing.getPrinter("xTB")


# ---------------------------------------------------------------------------
# Method / model mapping
# ---------------------------------------------------------------------------

#: User-facing method label -> list of xtb CLI arguments.
METHOD_TO_CLI = {
    "GFN2-xTB": ["--gfn", "2"],
    "GFN1-xTB": ["--gfn", "1"],
    "GFN0-xTB": ["--gfn", "0"],
    "GFN-FF": ["--gfnff"],
}

#: All methods we expose. Used as the parameter enumeration.
METHODS = tuple(METHOD_TO_CLI)

# ---------------------------------------------------------------------------
# Solvation
# ---------------------------------------------------------------------------

#: Solvation model labels -> CLI flag.
SOLVATION_MODEL_TO_FLAG = {
    "none": None,
    "ALPB": "--alpb",
    "GBSA": "--gbsa",
    "CPCM-X": "--cpcmx",
}

#: Solvation models we expose. ``"none"`` means no implicit solvation.
SOLVATION_MODELS = tuple(SOLVATION_MODEL_TO_FLAG)

# ALPB and GBSA share the same parametrized solvent list. The list below is
# from the xTB documentation as of v6.7. Some solvents are method-specific:
# DMF and n-hexane are GFN2-only; benzene is GFN1-only. We expose the union
# here and let xtb itself reject mismatches with a clear error.
SOLVENTS_GBSA_ALPB = (
    "acetone",
    "acetonitrile",
    "benzene",  # GFN1 only
    "CH2Cl2",
    "CHCl3",
    "CS2",
    "DMF",  # GFN2 only
    "DMSO",
    "ether",
    "H2O",
    "methanol",
    "n-hexane",  # GFN2 only
    "THF",
    "toluene",
)

# CPCM-X uses the much larger Minnesota Solvation Database. The most common
# entries are listed; users can also pass arbitrary strings via the variable
# mechanism (the parameter enumeration is not strictly enforced).
SOLVENTS_CPCMX = (
    "acetone",
    "acetonitrile",
    "aniline",
    "benzaldehyde",
    "benzene",
    "CH2Cl2",
    "CHCl3",
    "CS2",
    "dioxane",
    "DMF",
    "DMSO",
    "ether",
    "ethylacetate",
    "furane",
    "hexadecane",
    "hexane",
    "methanol",
    "nitromethane",
    "octanol",
    "phenol",
    "thf",
    "toluene",
    "water",
    "octanol(wet)",
)



[docs]
def solvent_choices_for(model):
    """Return the supported solvent enumeration for a solvation model.

    Parameters
    ----------
    model : str
        The solvation-model label (one of :data:`SOLVATION_MODELS`).

    Returns
    -------
    tuple of str
        The list of solvent names we expose to the user. Empty for "none".
    """
    if model == "ALPB" or model == "GBSA":
        return SOLVENTS_GBSA_ALPB
    if model == "CPCM-X":
        return SOLVENTS_CPCMX
    return ()



# ---------------------------------------------------------------------------
# Substep base class
# ---------------------------------------------------------------------------



[docs]
class Substep(seamm.Node):
    """Common base class for substeps of the xTB plug-in.

    Subclasses are expected to:

    * Set ``self._calculation`` to a short identifier (e.g. ``"Energy"``).
    * Set ``self._model`` once the active xTB Hamiltonian is known
      (e.g. ``"GFN2-xTB"``).
    * Provide their own ``parameters``, ``description_text``, ``run``,
      and ``analyze`` methods.

    The helpers below are available to all subclasses:

    * :meth:`check_periodicity` -- raise on periodic input.
    * :meth:`write_coord_xyz` -- write the molecule to ``coord.xyz``.
    * :meth:`base_xtb_args` -- build the common part of the xtb CLI.
    * :meth:`run_xtb` -- invoke xtb via the SEAMM executor.
    * :meth:`read_xtbout_json` -- parse ``xtbout.json``.
    * :meth:`parse_thermo_block` -- pull the thermochemistry table from the
      text output.
    """

    def __init__(
        self,
        flowchart=None,
        title="no title",
        extension=None,
        logger=logger,
        module=__name__,
    ):
        """Initialize the Substep base class.

        Parameters
        ----------
        flowchart : seamm.Flowchart
            The non-graphical flowchart that contains this step.
        title : str
            The name displayed in the flowchart.
        extension : None
            Not yet implemented.
        logger : Logger
            The logger to use and pass to parent classes.
        module : str
            The module name, used by the SEAMM Node base.
        """
        self._input_only = False
        self._calculation = None
        self._model = None

        super().__init__(
            flowchart=flowchart,
            title=title,
            extension=extension,
            logger=logger,
        )

    @property
    def header(self):
        """A printable header for this section of output."""
        return "Step {}: {}".format(".".join(str(e) for e in self._id), self.title)

    @property
    def input_only(self):
        """Whether to write the input only, not run xtb."""
        return self._input_only

    @input_only.setter
    def input_only(self, value):
        self._input_only = value

    @property
    def is_runable(self):
        """Indicate whether this substep runs xtb or just adds input."""
        return True

    @property
    def global_options(self):
        """Pass-through to the parent (top-level xTB step)'s global options."""
        return self.parent.global_options

    @property
    def options(self):
        """Pass-through to the parent (top-level xTB step)'s options."""
        return self.parent.options

    # ------------------------------------------------------------------
    # Periodicity refusal
    # ------------------------------------------------------------------


[docs]
    def check_periodicity(self, configuration):
        """Refuse periodic input with a clear, user-visible message.

        xTB's PBC support is limited and is out of scope for v1 of this
        plug-in. If the configuration is periodic, this method prints a
        message via the step printer and raises ``RuntimeError`` so the
        flowchart stops cleanly.
        """
        if configuration.periodicity != 0:
            msg = (
                "The xtb_step plug-in does not currently support periodic "
                "systems. The configuration has periodicity "
                f"{configuration.periodicity}. Please use a molecular "
                "(non-periodic) configuration, or use a different plug-in "
                "(e.g. dftbplus_step) for periodic xTB-family calculations."
            )
            printer.important(msg)
            raise RuntimeError(msg)


    # ------------------------------------------------------------------
    # Coordinate writer
    # ------------------------------------------------------------------


[docs]
    def write_coord_xyz(self, directory, configuration, filename="coord.xyz"):
        """Write the configuration to a standard XYZ file for xtb."""
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        path = directory / filename

        if hasattr(configuration, "to_xyz_text"):
            path.write_text(configuration.to_xyz_text())
        else:
            symbols = configuration.atoms.symbols
            coords = configuration.atoms.get_coordinates(fractionals=False)
            lines = [str(len(symbols)), ""]
            for sym, (x, y, z) in zip(symbols, coords):
                lines.append(f"{sym:<3s} {x:18.10f} {y:18.10f} {z:18.10f}")
            path.write_text("\n".join(lines) + "\n")

        return path


    # ------------------------------------------------------------------
    # CLI builders
    # ------------------------------------------------------------------


[docs]
    def base_xtb_args(self, P, configuration):
        """Build the parts of the xtb CLI common to all substeps.

        Charge and spin multiplicity are read from the configuration, not
        from ``P``. This matches the SEAMM convention (charge and spin are
        properties of the chemical system, not of the calculation) and
        lets high-throughput flowcharts loop over systems without per-
        system parameter editing.

        Parameters
        ----------
        P : dict
            The current parameter values (from
            ``parameters.current_values_to_dict()``). Expected keys include
            ``method``, ``accuracy``, ``solvation model``, ``solvent``.
        configuration : molsystem.Configuration
            Provides the net charge and spin multiplicity for the system.

        Returns
        -------
        list of str
            The argument list to prepend to the substep-specific arguments.
            Always begins with the coordinate filename (``coord.xyz``).
        """
        args = ["coord.xyz"]
        args += METHOD_TO_CLI[P["method"]]
        args += ["--json"]

        # Charge: from the configuration, always.
        charge = int(configuration.charge)
        args += ["--chrg", str(charge)]

        # Multiplicity / unpaired electrons. xtb takes --uhf with the number
        # of UNPAIRED electrons (M - 1 for spin multiplicity M).
        mult = int(configuration.spin_multiplicity)
        n_unpaired = max(mult - 1, 0)
        args += ["--uhf", str(n_unpaired)]

        # Accuracy
        if "accuracy" in P:
            try:
                acc = float(P["accuracy"])
            except (TypeError, ValueError):
                acc = 1.0
            if acc != 1.0:
                args += ["--acc", f"{acc:g}"]

        # Solvation
        smodel = P.get("solvation model", "none")
        if smodel != "none":
            flag = SOLVATION_MODEL_TO_FLAG[smodel]
            solvent = P.get("solvent", "H2O")
            args += [flag, str(solvent)]
            if smodel != "CPCM-X" and P["method"] == "GFN0-xTB":
                # ALPB/GBSA are not parametrized for GFN0-xTB.
                logger.warning(
                    "Solvation models ALPB/GBSA are not parametrized for "
                    "GFN0-xTB. xtb may emit a warning or refuse to run."
                )

        return args


    # ------------------------------------------------------------------
    # Executor wrapper
    # ------------------------------------------------------------------


[docs]
    def run_xtb(self, args, return_files=None, env=None):
        """Run xtb via the SEAMM executor.

        Reads the per-plug-in ``xtb.ini`` from the SEAMM root directory,
        falling back to the bundled ``data/xtb.ini`` template if the user
        does not yet have one. Then dispatches the run through the
        flowchart's executor (which handles conda/local/modules/docker
        transparently).
        """
        if return_files is None:
            return_files = []
        if env is None:
            env = {"OMP_NUM_THREADS": "1"}

        executor = self.parent.flowchart.executor
        executor_type = executor.name

        # Locate / bootstrap the xtb.ini configuration
        seamm_options = self.global_options
        ini_dir = Path(seamm_options["root"]).expanduser()
        ini_path = ini_dir / "xtb.ini"

        if not ini_path.exists():
            resources = importlib.resources.files("xtb_step") / "data"
            ini_text = (resources / "xtb.ini").read_text()
            ini_path.write_text(ini_text)
            logger.info(f"Bootstrapped default xtb.ini at {ini_path}")

        full_config = configparser.ConfigParser()
        full_config.read(ini_path)

        if executor_type not in full_config:
            path = shutil.which("xtb")
            if path is None:
                raise RuntimeError(
                    f"No section for '{executor_type}' in the xtb.ini "
                    f"({ini_path}), no defaults provided, and no 'xtb' "
                    "executable found on $PATH. Please install xtb (e.g. "
                    "via 'conda install -c conda-forge xtb') or edit "
                    f"{ini_path} to point at an xtb installation."
                )
            full_config.add_section(executor_type)
            full_config.set(executor_type, "installation", "local")
            full_config.set(executor_type, "code", str(path))
            with ini_path.open("w") as fd:
                full_config.write(fd)
            logger.info(
                f"Added a '{executor_type}' section to {ini_path} pointing "
                f"at {path}"
            )

        config = dict(full_config.items(executor_type))

        code = config.get("code", "xtb")
        cmd = [code, *args]

        result = executor.run(
            cmd=cmd,
            config=config,
            directory=self.directory,
            files={},
            return_files=return_files,
            in_situ=True,
            shell=True,
            env=env,
        )

        if not result:
            self.logger.error("There was an error running xtb")
            return None

        logger.debug("xtb run result:\n" + pprint.pformat(result))

        return result


    # ------------------------------------------------------------------
    # JSON parser
    # ------------------------------------------------------------------


[docs]
    def read_xtbout_json(self, directory=None, filename="xtbout.json"):
        """Read the xtb JSON output produced by ``--json``.

        xTB writes ``xtbout.json`` in the working directory when invoked with
        ``--json``. The schema includes (at least, in recent releases):
        ``"total energy"``, ``"HOMO-LUMO gap / eV"``, ``"electronic
        energy"``, ``"dipole / a.u."``, ``"partial charges"``, plus
        method-specific fields. The exact set varies between xtb versions;
        callers should treat the returned dict as best-effort.

        Parameters
        ----------
        directory : pathlib.Path, optional
            The directory containing the file. Defaults to ``self.directory``.
        filename : str
            The JSON filename to read. Defaults to ``xtbout.json``.

        Returns
        -------
        dict
            The parsed JSON data, or an empty dict if the file is missing
            or unreadable.
        """
        if directory is None:
            directory = self.directory
        path = Path(directory) / filename

        if not path.exists():
            logger.warning(f"xtb JSON output not found: {path}")
            return {}

        try:
            with path.open() as fd:
                return json.load(fd)
        except (OSError, json.JSONDecodeError) as exc:
            logger.warning(f"Could not parse xtb JSON output {path}: {exc}")
            return {}


    # ------------------------------------------------------------------
    # Text-output parsers
    # ------------------------------------------------------------------


[docs]
    def parse_thermo_block(self, stdout):
        """Pull thermochemistry quantities from xtb stdout.

        xtb's Hessian runs print a ``:: THERMODYNAMIC ::`` summary block
        and a per-temperature table. We extract the key scalars at a single
        temperature here. The format is::

            :: total free energy   -41.971849822766 Eh ::
            :: total energy        -42.153937303642 Eh ::
            :: zero point energy     0.182087480876 Eh ::
            :: G(RRHO) w/o ZPVE      0.000000000000 Eh ::
            :: G(RRHO) contrib.      0.182087480876 Eh ::

        and a temperature line like::

           T/K   H(0)-H(T)+PV   H(T)/Eh   T*S/Eh   G(T)/Eh
           298.15  ...   ...   ...   ...

        Energies are converted from E_h (xtb's native unit) to kJ/mol so
        the values returned here are chemist-friendly. The entropy
        contribution is reported both as ``entropy_term`` (T*S in
        kJ/mol, the form xtb prints) and as ``entropy`` (S in J/mol/K,
        the form that appears in standard tables and chemistry papers).

        Parameters
        ----------
        stdout : str
            The captured stdout (or contents of an ``xtb.out`` file) from a
            Hessian run.

        Returns
        -------
        dict
            Possibly containing keys: ``total_free_energy``,
            ``zero_point_energy``, ``temperature``, ``enthalpy``,
            ``entropy_term``, ``entropy``, ``gibbs_free_energy``. All
            energies in kJ/mol, ``entropy`` in J/mol/K, ``temperature``
            in K. Missing entries indicate parsing failed for that
            quantity; callers should handle absence gracefully.
        """
        out = {}
        eh_to_kjmol = (1.0 * ureg.hartree).to("kJ/mol").magnitude

        # Scalar lines from the THERMODYNAMIC summary block. Values in
        # the file are in E_h; convert to kJ/mol on the fly.
        scalar_patterns = {
            "total_free_energy": r":: total free energy\s+(-?\d+\.\d+)\s*Eh",
            "zero_point_energy": r":: zero point energy\s+(-?\d+\.\d+)\s*Eh",
        }
        for key, pat in scalar_patterns.items():
            m = re.search(pat, stdout)
            if m:
                try:
                    out[key] = float(m.group(1)) * eh_to_kjmol
                except ValueError:
                    pass

        # The per-temperature table: take the first data row after the
        # T/K header line.
        # Header looks roughly like:
        #   T/K     H(0)-H(T)+PV   H(T)/Eh    T*S/Eh    G(T)/Eh
        thermo_header = re.search(
            r"T/K\s+H\(0\)-H\(T\)\+PV\s+H\(T\)/Eh\s+T\*S/Eh\s+G\(T\)/Eh",
            stdout,
        )
        if thermo_header:
            tail = stdout[thermo_header.end() :]
            # Skip dash-separator lines, find first numeric row of 5 floats.
            for line in tail.splitlines():
                line = line.strip()
                if not line or set(line) <= {"-"}:
                    continue
                # Numbers may be in 0.123E+02 or plain decimal notation.
                num = r"-?\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?"
                m = re.match(rf"({num})\s+({num})\s+({num})\s+({num})\s+({num})", line)
                if m:
                    try:
                        T = float(m.group(1))
                        H_eh = float(m.group(3))
                        TS_eh = float(m.group(4))
                        G_eh = float(m.group(5))
                    except ValueError:
                        break
                    out["temperature"] = T  # K
                    out["enthalpy"] = H_eh * eh_to_kjmol  # kJ/mol
                    out["entropy_term"] = TS_eh * eh_to_kjmol  # T*S as kJ/mol
                    out["gibbs_free_energy"] = G_eh * eh_to_kjmol  # kJ/mol
                    # Derive entropy itself in J/mol/K (chemist's preferred form).
                    if T > 0.0:
                        out["entropy"] = (TS_eh * eh_to_kjmol * 1000.0) / T
                    break

        return out