-
Notifications
You must be signed in to change notification settings - Fork 1
/
gen_dude_conformers.py
executable file
·129 lines (103 loc) · 3.58 KB
/
gen_dude_conformers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
'''Given a smile, generate minimized RDKIT conformers using ETKDGv3.
Rank by energy and then filter by different rmsd thresholds.
'''
import numpy as np
import os, argparse,glob, sys
from rdkit.Chem import AllChem as Chem
import gzip, subprocess, multiprocessing
import traceback
parser = argparse.ArgumentParser()
parser.add_argument("--smi", type=str, required=True,help="smile file")
parser.add_argument("--maxconfs",type=int, default=250,help="max number of conformers to generate")
parser.add_argument("--seed",type=int,default=6262009,help="random seed")
parser.add_argument('--j',type=int,default=32,help='number of threads')
args = parser.parse_args()
base,_ = os.path.splitext(args.smi)
thresholds = [0, 0.5, 1.0, 1.5, 2.0]
sdwriters = []
outs = []
for t in thresholds:
if t == 0:
ename = f'{base}_rdkit_{args.maxconfs}_unranked.sdf.gz'
else:
ename = f'{base}_rdkit_{args.maxconfs}_filtered_{t}.sdf.gz'
outetkdg = gzip.open(ename,'wt')
outs.append(outetkdg)
sdwriteretkdg = Chem.SDWriter(outetkdg)
sdwriters.append(sdwriteretkdg)
def getRMS(mol, c1,c2):
rms = Chem.GetBestRMS(mol,mol,c1,c2)
return rms
def create_confs(line):
try:
toks = line.split()
smi = toks[0]
name = ' '.join(toks[1:])
pieces = smi.split('.')
if len(pieces) > 1:
smi = max(pieces, key=len) #take largest component by length
print("Taking largest component: %s\t%s" % (smi,name))
mol = Chem.MolFromSmiles(smi)
Chem.SanitizeMol(mol)
mol = Chem.AddHs(mol)
#etkdg
params = Chem.ETKDGv3()
params.randomSeed = args.seed
cids = Chem.EmbedMultipleConfs(mol, args.maxconfs, params)
cenergy = []
for conf in cids:
Chem.UFFOptimizeMolecule(mol,confId=conf)
cenergy.append(Chem.UFFGetMoleculeForceField(mol,confId=conf).CalcEnergy())
mol = Chem.RemoveHs(mol)
mol.SetProp("_Name",name);
sortedcids = sorted(cids,key = lambda cid: cenergy[cid])
ret = {}
for t in thresholds:
if t > 0:
selected = set()
energies = []
newmol = Chem.Mol(mol)
newmol.RemoveAllConformers()
for conf in sortedcids:
#check rmsd
passed = True
for seenconf in selected:
rms = getRMS(mol,seenconf,conf)
if rms < t:
break
else:
selected.add(conf)
energies.append(cenergy[conf])
newmol.AddConformer(mol.GetConformer(conf),assignId=True)
ret[t] = (newmol,name,energies)
else:
ret[t] = (mol,name,cenergy)
return ret
except:
print("Error",line)
traceback.print_exc()
pool = multiprocessing.Pool(args.j)
nummols = {t:0 for t in thresholds}
numconfs = {t:0 for t in thresholds}
for res in pool.imap_unordered(create_confs, open(args.smi,'rt')):
#I have no idea why the name property doesn't stick through pickling..
if res == None:
continue # error
for t,writer in zip(thresholds,sdwriters):
mol,name,energies = res[t]
mol.SetProp('_Name',name)
nummols[t] += 1
for cid in range(mol.GetNumConformers()):
mol.SetDoubleProp('energy',energies[cid])
writer.write(mol, confId=cid)
numconfs[t] += 1
for writer in sdwriters:
writer.close()
for t,out in zip(thresholds,outs):
cntname = out.name.replace('.sdf.gz','.cnt')
cout = open(cntname,'wt')
cout.write(f'NumConfs: {numconfs[t]}\n')
cout.write(f'NumMols: {nummols[t]}\n')
cout.close()
out.close()