Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gdb format molecule reading #291

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
add gdb parsing
  • Loading branch information
loriab committed Jun 11, 2022
commit 98e21e7a2761b450e5f0c261def52450e34a4f39
80 changes: 67 additions & 13 deletions qcelemental/molparse/from_string.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pprint
import re
from typing import Dict, Tuple, Union
from typing import Any, Dict, Tuple, Union

from ..exceptions import ChoicesError, MoleculeFormatError, ValidationError
from ..util import filter_comments, provenance_stamp
Expand Down Expand Up @@ -135,6 +135,25 @@ def from_string(
-----
<number of atoms> is pattern-matched but ignored.

gdb
---

+-------------+------------------------------------------------------------------------------------+
| Line | Content https://www.nature.com/articles/sdata201422/tables/3 |
+=============+====================================================================================+
| 1 | <number of atoms>, nat |
+-------------+------------------------------------------------------------------------------------+
| 2 | Scalar properties (see https://www.nature.com/articles/sdata201422/tables/4) |
+-------------+------------------------------------------------------------------------------------+
| 3,...,nat+2 | Element type, coordinate (x, y, z, in Å), Mulliken partial charges (in e) on atoms |
+-------------+------------------------------------------------------------------------------------+
| nat+3 | Harmonic vibrational frequencies (3nat−5 or 3nat-6, in cm−1) |
+-------------+------------------------------------------------------------------------------------+
| nat+4 | SMILES strings from GDB-17 and from B3LYP relaxation |
+-------------+------------------------------------------------------------------------------------+
| nat+5 | InChI strings for Corina and B3LYP geometries |
+-------------+------------------------------------------------------------------------------------+

psi4 - Psi4 molecule {...} format
---------------------------------

Expand Down Expand Up @@ -178,11 +197,11 @@ def from_string(
# << 1 >> str-->str -- discard comments
molstr = filter_comments(molstr.strip())

def parse_as_xyz_ish(molstr, strict):
def parse_as_xyz_ish(molstr, strict, gdb):
molinit = {}

# << 2 >> str-->dict -- process atoms, units[, chg, mult]
molstr, processed = _filter_xyz(molstr, strict=strict)
molstr, processed = _filter_xyz(molstr, strict=strict, gdb=gdb)
molinit.update(processed)

if molstr:
Expand Down Expand Up @@ -219,17 +238,20 @@ def parse_as_psi4_ish(molstr, unsettled):
return molstr, molinit

if dtype == "xyz":
molstr, molinit = parse_as_xyz_ish(molstr, strict=True)
molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=False)

elif dtype == "xyz+":
molstr, molinit = parse_as_xyz_ish(molstr, strict=False)
molstr, molinit = parse_as_xyz_ish(molstr, strict=False, gdb=False)

elif dtype == "psi4":
molstr, molinit = parse_as_psi4_ish(molstr, unsettled=False)

elif dtype == "psi4+":
molstr, molinit = parse_as_psi4_ish(molstr, unsettled=True)

elif dtype == "gdb":
molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=True)

elif dtype is None:
dtype = "[psi4, xyz, xyz+, psi4+]" # for error message
try:
Expand All @@ -239,14 +261,14 @@ def parse_as_psi4_ish(molstr, unsettled):
min_error_length = len(str(e))
min_error = e
try:
molstr, molinit = parse_as_xyz_ish(molstr, strict=True)
molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=False)
dtype = "xyz"
except MoleculeFormatError as e:
if len(str(e)) < min_error_length:
min_error_length = len(str(e))
min_error = e
try:
molstr, molinit = parse_as_xyz_ish(molstr, strict=False)
molstr, molinit = parse_as_xyz_ish(molstr, strict=False, gdb=False)
dtype = "xyz+"
except MoleculeFormatError as e:
if len(str(e)) < min_error_length:
Expand Down Expand Up @@ -680,19 +702,27 @@ def process_variable(matchobj):
r"\A" + r"(?P<nucleus>" + NUCLEUS + r")" + SEP + CARTXYZ + r"\Z", re.IGNORECASE | re.VERBOSE
)

xyz2_gdb = re.compile(r"\A" + r"gdb\d*" + SEP + r"(\d+)" + r"(" + SEP + NUMBER + r")*", re.IGNORECASE | re.VERBOSE)
atom_cartesian_strict_gdb = re.compile(
r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + SEP + NUMBER + r"\Z", re.IGNORECASE | re.VERBOSE
)
reNUMBER = r"(?x:" + NUMBER + ")"


def _filter_xyz(string, strict):
def _filter_xyz(string: str, *, strict: bool, gdb: bool) -> Tuple[str, Dict[str, Any]]:
r"""Handle extracting atom, units, and chg/mult lines from `string`.

Parameters
----------
strict : bool
strict
Whether to enforce a strict XYZ file format or to allow units, chg/mult,
and add'l atom info.
gdb
Whether to expect GDB format with extra properties info.

Returns
-------
str, dict
Tuple[str, Dict[str, Any]]
Returns first a subset of `string` containing the unmatched contents.
These are generally input violations unless handled by a subsequent
processing function.
Expand Down Expand Up @@ -733,8 +763,32 @@ def process_atom_cartesian(matchobj):
processed["geom"] = []
processed["elbl"] = []

if strict:
for iln, line in enumerate(string.split("\n")):
splitstring = string.strip().split("\n")

if gdb:
for iln, line in enumerate(splitstring):
line = line.strip()
if iln == 0:
line = re.sub(xyz1strict, "", line)
elif iln == 1:
line = re.sub(xyz2_gdb, "", line)
elif iln == (len(splitstring) - 3):
nfr = 3 * (iln - 2) - 6
freqs = re.split(reNUMBER, line)
freqs = [float(fr) for fr in freqs if fr.strip()]
if len(freqs) == nfr or len(freqs) == nfr - 1:
line = ""
elif iln == (len(splitstring) - 2):
line = ""
elif iln == (len(splitstring) - 1):
line = ""
else:
line = re.sub(atom_cartesian_strict_gdb, process_atom_cartesian, line)
if line:
reconstitute.append(line)

elif strict:
for iln, line in enumerate(splitstring):
line = line.strip()
if iln == 0:
line = re.sub(xyz1strict, "", line)
Expand All @@ -745,7 +799,7 @@ def process_atom_cartesian(matchobj):
if line:
reconstitute.append(line)
else:
for iln, line in enumerate(string.split("\n")):
for iln, line in enumerate(splitstring):
line = line.strip()
if iln == 0:
line = re.sub(xyz1, process_bohrang, line)
Expand Down
Loading