Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gdb format molecule reading #291

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,18 @@ Changelog
.. +++++++++


0.25.0 / 2022-MM-DD
0.25.0 / 2022-06-DD
-------------------

Breaking Changes
++++++++++++++++

New Features
++++++++++++
- (:pr:`288`, :pr:`291`) The molecule ``from_string`` parser learned new format
``dtype="gdb"`` from https://www.nature.com/articles/sdata201422/tables/3 .
At present, dtype must be specified explicitly to trigger this parsing; it is
not in the customary fallback sequence of unspecified dtype.

Enhancements
++++++++++++
Expand All @@ -37,8 +41,9 @@ Enhancements

Bug Fixes
+++++++++
- (:pr:`286`) Sphinx autodocumentation with typing of ``qcelemental.testing.compare_recursive`` no longer
warns about circular import.
- (:pr:`286`) Sphinx autodocumentation with typing of
``qcelemental.testing.compare_recursive`` no longer warns about circular
import.


0.24.0 / 2021-11-18
Expand Down
4 changes: 2 additions & 2 deletions qcelemental/models/molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@ def from_data(
else:
raise TypeError("Input type not understood, please supply the 'dtype' kwarg.")

if dtype in ["string", "psi4", "xyz", "xyz+"]:
if dtype in ["string", "psi4", "xyz", "xyz+", "gdb"]:
mol_dict = from_string(data, dtype if dtype != "string" else None)
assert isinstance(mol_dict, dict)
input_dict = to_schema(mol_dict["qm"], dtype=2, np_out=True)
Expand Down Expand Up @@ -946,7 +946,7 @@ def from_file(cls, filename: str, dtype: Optional[str] = None, *, orient: bool =
dtype = "string"

# Raw string type, read and pass through
if dtype in ["string", "xyz", "xyz+", "psi4"]:
if dtype in ["string", "xyz", "xyz+", "psi4", "gdb"]:
with open(filename, "r") as infile:
data = infile.read()
elif dtype == "numpy":
Expand Down
108 changes: 93 additions & 15 deletions qcelemental/molparse/from_string.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pprint
import re
from typing import Dict, Tuple, Union
from typing import Any, Dict, Tuple, Union

from ..exceptions import ChoicesError, MoleculeFormatError, ValidationError
from ..util import filter_comments, provenance_stamp
Expand Down Expand Up @@ -135,6 +135,35 @@ def from_string(
-----
<number of atoms> is pattern-matched but ignored.

gdb
---

+-------------+------------------------------------------------------------------------------------+
| Line | Content https://www.nature.com/articles/sdata201422/tables/3 |
+=============+====================================================================================+
| 1 | <number of atoms>, nat |
+-------------+------------------------------------------------------------------------------------+
| 2 | Scalar properties (see https://www.nature.com/articles/sdata201422/tables/4) |
+-------------+------------------------------------------------------------------------------------+
| 3,...,nat+2 | Element type, coordinate (x, y, z, in Å), Mulliken partial charges (in e) on atoms |
+-------------+------------------------------------------------------------------------------------+
| nat+3 | Harmonic vibrational frequencies (3nat−5 or 3nat-6, in cm−1) |
+-------------+------------------------------------------------------------------------------------+
| nat+4 | SMILES strings from GDB-17 and from B3LYP relaxation |
+-------------+------------------------------------------------------------------------------------+
| nat+5 | InChI strings for Corina and B3LYP geometries |
+-------------+------------------------------------------------------------------------------------+

QM Domain
---------
Specifiable: geom, elem (element identity)

Notes
-----
It's unclear what extras in the way of extra lines, columns and
fields are required, so enforcement vs. validation vs. ignore is
mixed in this routine and is subject to change.

psi4 - Psi4 molecule {...} format
---------------------------------

Expand Down Expand Up @@ -173,16 +202,16 @@ def from_string(

"""
if verbose >= 2:
print("<<< FROM_STRING\n", molstr, "\n>>>")
print(f"<<< FROM_STRING: {dtype}\n", molstr, "\n>>>")

# << 1 >> str-->str -- discard comments
molstr = filter_comments(molstr.strip())

def parse_as_xyz_ish(molstr, strict):
def parse_as_xyz_ish(molstr, strict, gdb):
molinit = {}

# << 2 >> str-->dict -- process atoms, units[, chg, mult]
molstr, processed = _filter_xyz(molstr, strict=strict)
molstr, processed = _filter_xyz(molstr, strict=strict, gdb=gdb)
molinit.update(processed)

if molstr:
Expand Down Expand Up @@ -219,17 +248,20 @@ def parse_as_psi4_ish(molstr, unsettled):
return molstr, molinit

if dtype == "xyz":
molstr, molinit = parse_as_xyz_ish(molstr, strict=True)
molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=False)

elif dtype == "xyz+":
molstr, molinit = parse_as_xyz_ish(molstr, strict=False)
molstr, molinit = parse_as_xyz_ish(molstr, strict=False, gdb=False)

elif dtype == "psi4":
molstr, molinit = parse_as_psi4_ish(molstr, unsettled=False)

elif dtype == "psi4+":
molstr, molinit = parse_as_psi4_ish(molstr, unsettled=True)

elif dtype == "gdb":
molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=True)

elif dtype is None:
dtype = "[psi4, xyz, xyz+, psi4+]" # for error message
try:
Expand All @@ -239,14 +271,14 @@ def parse_as_psi4_ish(molstr, unsettled):
min_error_length = len(str(e))
min_error = e
try:
molstr, molinit = parse_as_xyz_ish(molstr, strict=True)
molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=False)
dtype = "xyz"
except MoleculeFormatError as e:
if len(str(e)) < min_error_length:
min_error_length = len(str(e))
min_error = e
try:
molstr, molinit = parse_as_xyz_ish(molstr, strict=False)
molstr, molinit = parse_as_xyz_ish(molstr, strict=False, gdb=False)
dtype = "xyz+"
except MoleculeFormatError as e:
if len(str(e)) < min_error_length:
Expand All @@ -259,7 +291,14 @@ def parse_as_psi4_ish(molstr, unsettled):
if len(str(e)) < min_error_length:
min_error_length = len(str(e))
min_error = e
raise min_error
try:
molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=True)
dtype = "gdb"
except MoleculeFormatError as e:
if len(str(e)) < min_error_length:
min_error_length = len(str(e))
min_error = e
raise min_error
else:
raise KeyError(f"Molecule: dtype of `{dtype}` not recognized.")

Expand Down Expand Up @@ -680,19 +719,27 @@ def process_variable(matchobj):
r"\A" + r"(?P<nucleus>" + NUCLEUS + r")" + SEP + CARTXYZ + r"\Z", re.IGNORECASE | re.VERBOSE
)

xyz2_gdb = re.compile(r"\A" + r"gdb\d*" + SEP + r"(\d+)" + r"(" + SEP + NUMBER + r")*", re.IGNORECASE | re.VERBOSE)
atom_cartesian_strict_gdb = re.compile(
r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + SEP + NUMBER + r"\Z", re.IGNORECASE | re.VERBOSE
)
reNUMBER = r"(?x:" + NUMBER + ")"

def _filter_xyz(string, strict):

def _filter_xyz(string: str, *, strict: bool, gdb: bool) -> Tuple[str, Dict[str, Any]]:
r"""Handle extracting atom, units, and chg/mult lines from `string`.

Parameters
----------
strict : bool
strict
Whether to enforce a strict XYZ file format or to allow units, chg/mult,
and add'l atom info.
gdb
Whether to expect GDB format with extra properties info.

Returns
-------
str, dict
Tuple[str, Dict[str, Any]]
Returns first a subset of `string` containing the unmatched contents.
These are generally input violations unless handled by a subsequent
processing function.
Expand Down Expand Up @@ -733,8 +780,39 @@ def process_atom_cartesian(matchobj):
processed["geom"] = []
processed["elbl"] = []

if strict:
for iln, line in enumerate(string.split("\n")):
splitstring = string.strip().split("\n")

if gdb:
nat = len(splitstring) - 5
for iln, line in enumerate(splitstring):
line = line.strip()
if iln == 0:
line = re.sub(xyz1strict, "", line)
elif iln == 1:
line = re.sub(xyz2_gdb, "", line)
elif iln == nat + 2:
nfr = 3 * (iln - 2) - 6
freqs = re.split(reNUMBER, line)
try:
freqs = [float(fr) for fr in freqs if fr.strip()]
except ValueError:
pass
else:
if len(freqs) == nfr or len(freqs) == nfr - 1:
line = ""
else:
line += f" ValidationError: {len(freqs)} != {nfr}"
elif iln == nat + 3:
line = ""
elif iln == nat + 4:
line = ""
else:
line = re.sub(atom_cartesian_strict_gdb, process_atom_cartesian, line)
if line:
reconstitute.append(line)

elif strict:
for iln, line in enumerate(splitstring):
line = line.strip()
if iln == 0:
line = re.sub(xyz1strict, "", line)
Expand All @@ -745,7 +823,7 @@ def process_atom_cartesian(matchobj):
if line:
reconstitute.append(line)
else:
for iln, line in enumerate(string.split("\n")):
for iln, line in enumerate(splitstring):
line = line.strip()
if iln == 0:
line = re.sub(xyz1, process_bohrang, line)
Expand Down
Loading