Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add gdb format molecule reading #291

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
more gdb and error tests
  • Loading branch information
loriab committed Jun 13, 2022
commit 508817f08735fac6bb81254f7ad06bc2e8e6db22
40 changes: 32 additions & 8 deletions qcelemental/molparse/from_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,16 @@ def from_string(
| nat+5 | InChI strings for Corina and B3LYP geometries |
+-------------+------------------------------------------------------------------------------------+

QM Domain
---------
Specifiable: geom, elem (element identity)

Notes
-----
It is unclear which of the extra lines, columns, and fields in this
format are required, so this routine currently mixes enforcing,
validating, and ignoring them; that behavior is subject to change.

psi4 - Psi4 molecule {...} format
---------------------------------

Expand Down Expand Up @@ -192,7 +202,7 @@ def from_string(

"""
if verbose >= 2:
print("<<< FROM_STRING\n", molstr, "\n>>>")
print(f"<<< FROM_STRING: {dtype}\n", molstr, "\n>>>")

# << 1 >> str-->str -- discard comments
molstr = filter_comments(molstr.strip())
Expand Down Expand Up @@ -281,7 +291,14 @@ def parse_as_psi4_ish(molstr, unsettled):
if len(str(e)) < min_error_length:
min_error_length = len(str(e))
min_error = e
raise min_error
try:
molstr, molinit = parse_as_xyz_ish(molstr, strict=True, gdb=True)
dtype = "gdb"
except MoleculeFormatError as e:
if len(str(e)) < min_error_length:
min_error_length = len(str(e))
min_error = e
raise min_error
else:
raise KeyError(f"Molecule: dtype of `{dtype}` not recognized.")

Expand Down Expand Up @@ -766,21 +783,28 @@ def process_atom_cartesian(matchobj):
splitstring = string.strip().split("\n")

if gdb:
nat = len(splitstring) - 5
for iln, line in enumerate(splitstring):
line = line.strip()
if iln == 0:
line = re.sub(xyz1strict, "", line)
elif iln == 1:
line = re.sub(xyz2_gdb, "", line)
elif iln == (len(splitstring) - 3):
elif iln == nat + 2:
nfr = 3 * (iln - 2) - 6
freqs = re.split(reNUMBER, line)
freqs = [float(fr) for fr in freqs if fr.strip()]
if len(freqs) == nfr or len(freqs) == nfr - 1:
line = ""
elif iln == (len(splitstring) - 2):
try:
freqs = [float(fr) for fr in freqs if fr.strip()]
except ValueError:
pass
else:
if len(freqs) == nfr or len(freqs) == nfr - 1:
line = ""
else:
line += f" ValidationError: {len(freqs)} != {nfr}"
elif iln == nat + 3:
line = ""
elif iln == (len(splitstring) - 1):
elif iln == nat + 4:
line = ""
else:
line = re.sub(atom_cartesian_strict_gdb, process_atom_cartesian, line)
Expand Down
56 changes: 53 additions & 3 deletions qcelemental/tests/test_molparse_from_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,7 +922,7 @@ def test_xyzp_qm_7e():


@pytest.mark.parametrize(
"string",
"string,err",
[
pytest.param(
"""5
Expand All @@ -935,6 +935,7 @@ def test_xyzp_qm_7e():
1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034 3151.6788 3151.7078
C C
InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4""",
"H 1.0117308433 1.4637511618 0.0002765748",
id="gdb missing atom charge",
),
pytest.param(
Expand All @@ -948,6 +949,7 @@ def test_xyzp_qm_7e():
1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034 3151.6788 3151.7078
C C
InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4""",
"catinhat",
id="gdb bad properties line",
),
pytest.param(
Expand All @@ -961,6 +963,7 @@ def test_xyzp_qm_7e():
1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034
C C
InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4""",
"7 != 9",
id="gdb freq short",
),
pytest.param(
Expand All @@ -974,15 +977,62 @@ def test_xyzp_qm_7e():
1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034 3151.6788 3151.7078 12.0 12.0
C C
InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4""",
"11 != 9",
id="gdb freq long",
),
pytest.param(
"""13
gdb 4316 3.93756 2.02622 1.33821 5.7651 66.18 -0.217 -0.0372 0.1797 847.7575 0.097315 -394.821424 -394.81461 -394.813666 -394.852382 25.805
N -0.0143918233 1.3422664394 0.0973156565 -0.556275
C -0.0058800462 -0.013801326 0.0335018763 0.40951
N -1.1518605857 -0.6353642397 0.0216429738 -0.354471
C -1.1971682702 -1.9977380377 -6.8571*^-6 0.422062
N -2.2645393948 -2.6774072178 0.0000494887 -0.535907
O 0.0510929436 -2.7190221228 -0.03199627 -0.233941
C 1.1999026383 -2.0443363865 -0.0235842496 0.106292
C 1.2668995622 -0.7038697105 0.0104645383 -0.260464
H 0.8171330711 1.8677433962 -0.1059082712 0.263977
H -0.9056126713 1.7991786213 -0.012775232 0.271691
H -2.0483357305 -3.6722781589 -0.022437848 0.219518
H 2.0635548263 -2.70124622 -0.0466474882 0.140417
H 2.2155769702 -0.185937237 0.0247522526 0.107591
99.8509 199.7047 260.2158 350.8436 381.7756 506.8614 519.8583 550.6524 592.8235 714.629 718.3069 753.9357 778.0578 821.8909 824.4073 965.9996 968.3802 1015.0916 1097.1661 1193.0322 1240.2795 1315.9392 1375.6194 1496.0617 1588.1408 1632.8032 1711.5962 1761.6849 3190.1301 3229.6588 3516.943 3604.8213 3739.2263
NC1=NC(=N)OC=C1 NC1=CCOC(=N1)[NH]
InChI=1S/C4H5N3O/c5-3-1-2-8-4(6)7-3/h1-2H,(H3,5,6,7) InChI=1S/C4H6N3O/c5-3-1-2-8-4(6)7-3/h1,6H,2H2,(H2,5,7)""",
"-6.8571*^-6",
id="weird float format",
),
],
)
def test_xyz_gdb_error(string):
def test_xyz_gdb_error(string, err):

with pytest.raises(qcelemental.MoleculeFormatError):
with pytest.raises(qcelemental.MoleculeFormatError) as e:
qcelemental.molparse.from_string(string, return_processed=False, dtype="gdb")

assert err in str(e.value)

@pytest.mark.parametrize("dtype", [None, "xyz", "xyz+"])
def test_xyz_fields_error(dtype):
    """Check that atom lines with an extra column are rejected.

    Each atom line below carries an extra integer field between the element
    symbol and the three coordinates. For every parametrized ``dtype``,
    ``from_string`` is expected to raise ``MoleculeFormatError`` with the
    first offending atom line quoted in the message.
    """
    molstr = """3

O 8 0.0 0.0 0.0
H 1 1.0 0.0 0.0
H 1 0.0 1.0 0.0
"""

    with pytest.raises(qcelemental.MoleculeFormatError) as excinfo:
        qcelemental.molparse.from_string(molstr, return_processed=False, dtype=dtype)

    # NOTE: with dtype=None the reported unprocessable string may change as
    # more dtypes are added to the auto-detection chain.
    message = str(excinfo.value)
    assert "O 8 0.0 0.0 0.0" in message


@pytest.mark.parametrize(
"string,elem,geom",
Expand Down