MolSSI · loriab · Jun 11, 2022 · Jun 11, 2022 · Jun 12, 2022 · Jun 13, 2022
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -17,14 +17,18 @@ Changelog
 .. +++++++++
 
 
-0.25.0 / 2022-MM-DD
+0.25.0 / 2022-06-DD
 -------------------
 
 Breaking Changes
 ++++++++++++++++
 
 New Features
 ++++++++++++
+- (:pr:`288`, :pr:`291`) The molecule ``from_string`` parser learned new format
+ ``dtype="gdb"`` from https://www.nature.com/articles/sdata201422/tables/3 .
+ At present, dtype must be specified explicitly to trigger this parsing; it is
+ not in the customary fallback sequence of unspecified dtype.
 
 Enhancements
 ++++++++++++
@@ -37,8 +41,9 @@ Enhancements
 
 Bug Fixes
 +++++++++
-- (:pr:`286`) Sphinx autodocumentation with typing of ``qcelemental.testing.compare_recursive`` no longer
- warns about circular import.
+- (:pr:`286`) Sphinx autodocumentation with typing of
+ ``qcelemental.testing.compare_recursive`` no longer warns about circular
+ import.
 
 
 0.24.0 / 2021-11-18

diff --git a/qcelemental/models/molecule.py b/qcelemental/models/molecule.py
@@ -870,7 +870,7 @@ def from_data(
  else:
  raise TypeError("Input type not understood, please supply the 'dtype' kwarg.")
 
- if dtype in ["string", "psi4", "xyz", "xyz+"]:
+ if dtype in ["string", "psi4", "xyz", "xyz+", "gdb"]:
  mol_dict = from_string(data, dtype if dtype != "string" else None)
  assert isinstance(mol_dict, dict)
  input_dict = to_schema(mol_dict["qm"], dtype=2, np_out=True)
@@ -946,7 +946,7 @@ def from_file(cls, filename: str, dtype: Optional[str] = None, *, orient: bool =
  dtype = "string"
 
  # Raw string type, read and pass through
- if dtype in ["string", "xyz", "xyz+", "psi4"]:
+ if dtype in ["string", "xyz", "xyz+", "psi4", "gdb"]:
  with open(filename, "r") as infile:
  data = infile.read()
  elif dtype == "numpy":

diff --git a/qcelemental/molparse/from_string.py b/qcelemental/molparse/from_string.py
@@ -1,6 +1,6 @@
 import pprint
 import re
-from typing import Dict, Tuple, Union
+from typing import Any, Dict, Tuple, Union
 
 from ..exceptions import ChoicesError, MoleculeFormatError, ValidationError
 from ..util import filter_comments, provenance_stamp
@@ -135,6 +135,35 @@ def from_string(
  -----
  <number of atoms> is pattern-matched but ignored.
 
+ gdb
+ ---
+
+ +-------------+------------------------------------------------------------------------------------+
+ | Line | Content https://www.nature.com/articles/sdata201422/tables/3 |
+ +=============+====================================================================================+
+ | 1 | <number of atoms>, nat |
+ +-------------+------------------------------------------------------------------------------------+
+ | 2 | Scalar properties (see https://www.nature.com/articles/sdata201422/tables/4) |
+ +-------------+------------------------------------------------------------------------------------+
+ | 3,...,nat+2 | Element type, coordinate (x, y, z, in Å), Mulliken partial charges (in e) on atoms |
+ +-------------+------------------------------------------------------------------------------------+
+ | nat+3 | Harmonic vibrational frequencies (3nat−5 or 3nat-6, in cm−1) |
+ +-------------+------------------------------------------------------------------------------------+
+ | nat+4 | SMILES strings from GDB-17 and from B3LYP relaxation |
+ +-------------+------------------------------------------------------------------------------------+
+ | nat+5 | InChI strings for Corina and B3LYP geometries |
+ +-------------+------------------------------------------------------------------------------------+
+
+ QM Domain
+ ---------
+ Specifiable: geom, elem (element identity)
+
+ Notes
+ -----
+ It's unclear what extras in the way of extra lines, columns and
+ fields are required, so enforcement vs. validation vs. ignore is
+ mixed in this routine and is subject to change.
+
  psi4 - Psi4 molecule {...} format
  ---------------------------------
 
@@ -173,16 +202,16 @@ def from_string(
 
  """
  if verbose >= 2:
- print("<<< FROM_STRING\n", molstr, "\n>>>")
+ print(f"<<< FROM_STRING: {dtype}\n", molstr, "\n>>>")
 
  # << 1 >> str-->str -- discard comments
  molstr = filter_comments(molstr.strip())
 
- def parse_as_xyz_ish(molstr, strict):
+ def parse_as_xyz_ish(molstr, strict, gdb):
  molinit = {}
 
  # << 2 >> str-->dict -- process atoms, units[, chg, mult]
- molstr, processed = _filter_xyz(molstr, strict=strict)
+ molstr, processed = _filter_xyz(molstr, strict=strict, gdb=gdb)
  molinit.update(processed)
 
  if molstr:
@@ -219,17 +248,20 @@ def parse_as_psi4_ish(molstr, unsettled):
  return molstr, molinit
 
  if dtype == "xyz":
- molstr, molinit = parse_as_xyz_ish(molstr, strict=True)
+ molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=False)
 
  elif dtype == "xyz+":
- molstr, molinit = parse_as_xyz_ish(molstr, strict=False)
+ molstr, molinit = parse_as_xyz_ish(molstr, strict=False, gdb=False)
 
  elif dtype == "psi4":
  molstr, molinit = parse_as_psi4_ish(molstr, unsettled=False)
 
  elif dtype == "psi4+":
  molstr, molinit = parse_as_psi4_ish(molstr, unsettled=True)
 
+ elif dtype == "gdb":
+ molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=True)
+
  elif dtype is None:
  dtype = "[psi4, xyz, xyz+, psi4+]" # for error message
  try:
@@ -239,14 +271,14 @@ def parse_as_psi4_ish(molstr, unsettled):
  min_error_length = len(str(e))
  min_error = e
  try:
- molstr, molinit = parse_as_xyz_ish(molstr, strict=True)
+ molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=False)
  dtype = "xyz"
  except MoleculeFormatError as e:
  if len(str(e)) < min_error_length:
  min_error_length = len(str(e))
  min_error = e
  try:
- molstr, molinit = parse_as_xyz_ish(molstr, strict=False)
+ molstr, molinit = parse_as_xyz_ish(molstr, strict=False, gdb=False)
  dtype = "xyz+"
  except MoleculeFormatError as e:
  if len(str(e)) < min_error_length:
@@ -259,7 +291,14 @@ def parse_as_psi4_ish(molstr, unsettled):
  if len(str(e)) < min_error_length:
  min_error_length = len(str(e))
  min_error = e
- raise min_error
+ try:
+ molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=True)
+ dtype = "gdb"
+ except MoleculeFormatError as e:
+ if len(str(e)) < min_error_length:
+ min_error_length = len(str(e))
+ min_error = e
+ raise min_error
  else:
  raise KeyError(f"Molecule: dtype of `{dtype}` not recognized.")
 
@@ -680,19 +719,27 @@ def process_variable(matchobj):
  r"\A" + r"(?P<nucleus>" + NUCLEUS + r")" + SEP + CARTXYZ + r"\Z", re.IGNORECASE | re.VERBOSE
 )
 
+xyz2_gdb = re.compile(r"\A" + r"gdb\d*" + SEP + r"(\d+)" + r"(" + SEP + NUMBER + r")*", re.IGNORECASE | re.VERBOSE)
+atom_cartesian_strict_gdb = re.compile(
+ r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + SEP + NUMBER + r"\Z", re.IGNORECASE | re.VERBOSE
+)
+reNUMBER = r"(?x:" + NUMBER + ")"
 
-def _filter_xyz(string, strict):
+
+def _filter_xyz(string: str, *, strict: bool, gdb: bool) -> Tuple[str, Dict[str, Any]]:
  r"""Handle extracting atom, units, and chg/mult lines from `string`.
 
  Parameters
  ----------
- strict : bool
+ strict
  Whether to enforce a strict XYZ file format or to allow units, chg/mult,
  and add'l atom info.
+ gdb
+ Whether to expect GDB format with extra properties info.
 
  Returns
  -------
- str, dict
+ Tuple[str, Dict[str, Any]]
  Returns first a subset of `string` containing the unmatched contents.
  These are generally input violations unless handled by a subsequent
  processing function.
@@ -733,8 +780,39 @@ def process_atom_cartesian(matchobj):
  processed["geom"] = []
  processed["elbl"] = []
 
- if strict:
- for iln, line in enumerate(string.split("\n")):
+ splitstring = string.strip().split("\n")
+
+ if gdb:
+ nat = len(splitstring) - 5
+ for iln, line in enumerate(splitstring):
+ line = line.strip()
+ if iln == 0:
+ line = re.sub(xyz1strict, "", line)
+ elif iln == 1:
+ line = re.sub(xyz2_gdb, "", line)
+ elif iln == nat + 2:
+ nfr = 3 * (iln - 2) - 6
+ freqs = re.split(reNUMBER, line)
+ try:
+ freqs = [float(fr) for fr in freqs if fr.strip()]
+ except ValueError:
+ pass
+ else:
+ if len(freqs) == nfr or len(freqs) == nfr - 1:
+ line = ""
+ else:
+ line += f" ValidationError: {len(freqs)} != {nfr}"
+ elif iln == nat + 3:
+ line = ""
+ elif iln == nat + 4:
+ line = ""
+ else:
+ line = re.sub(atom_cartesian_strict_gdb, process_atom_cartesian, line)
+ if line:
+ reconstitute.append(line)
+
+ elif strict:
+ for iln, line in enumerate(splitstring):
  line = line.strip()
  if iln == 0:
  line = re.sub(xyz1strict, "", line)
@@ -745,7 +823,7 @@ def process_atom_cartesian(matchobj):
  if line:
  reconstitute.append(line)
  else:
- for iln, line in enumerate(string.split("\n")):
+ for iln, line in enumerate(splitstring):
  line = line.strip()
  if iln == 0:
  line = re.sub(xyz1, process_bohrang, line)