Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jv branch #204

Merged
merged 5 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
1. Multithreading in CSEARCH
  • Loading branch information
jvalegre committed Jul 24, 2024
commit 1aa23a4c1b50cd37e987e5e112f963e33b0e7e58
83 changes: 32 additions & 51 deletions aqme/csearch/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def __init__(self, **kwargs):
self.args.log.write(f"\nStarting CSEARCH with {len(job_inputs)} job(s) (SDF, XYZ, CSV, etc. files might contain multiple jobs/structures inside)\n")

# runs the conformer sampling with multiprocessors
self.run_csearch(job_inputs)
_ = self.run_csearch(job_inputs)

# store all the information into a CSV file
csearch_file_no_path = (
Expand Down Expand Up @@ -388,71 +388,48 @@ def run_csearch(self, job_inputs):
"o Number of finished jobs from CSEARCH", max=len(job_inputs)
)

# rdkit benefits from using multithreading (the RMSD filter only uses 1 proc, when trying to use
# more the program collapses)
# RDKit jobs benefit from multithreading at the job level because the RMSD filter
# (RDKit's GetBestRMS) doesn't parallelize well on its own: by default it uses 1 thread
# and it fails when asked to use more, and in our experience it isn't efficient (we're
# not sure whether it tries to use all the CPUs or only 1)
if self.args.program.lower() == "rdkit":
csearch_procs = self.args.nprocs
else:
else: # CREST already parallelizes CPUs
csearch_procs = 1

with futures.ProcessPoolExecutor(

# asynchronous multithreading to accelerate CSEARCH (only benefits RDKit)
with futures.ThreadPoolExecutor(
max_workers=csearch_procs,
) as executor:
# Submit a set of asynchronous jobs
jobs = []
# Submit the Jobs
for job_input in job_inputs:
(
smi_,
name_,
charge_,
mult_,
constraints_atoms_,
constraints_dist_,
constraints_angle_,
constraints_dihedral_,
complex_type_,
geom_
) = job_input
job = executor.submit(
self.compute_confs(
smi_,
name_,
charge_,
mult_,
constraints_atoms_,
constraints_dist_,
constraints_angle_,
constraints_dihedral_,
complex_type_,
geom_
)
_ = executor.submit(
self.compute_confs, job_input,bar
)
jobs.append(job)

bar.next()

bar.finish()
bar.finish()

def compute_confs(
self,
smi,
name,
charge,
mult,
constraints_atoms,
constraints_dist,
constraints_angle,
constraints_dihedral,
complex_type,
geom
):
def compute_confs(self,job_input,bar):
"""
Function to start conformer generation
"""

# load variables from job_input
(
smi,
name,
charge,
mult,
constraints_atoms,
constraints_dist,
constraints_angle,
constraints_dihedral,
complex_type,
geom
) = job_input

self.args.log.write(f"\n ----- {os.path.basename(Path(name))} -----")

# load mol and other parameters when using SMILES as input
if self.args.smi is not None or os.path.basename(Path(self.args.input)).split(".")[1] in ["smi","csv","cdx","txt","yaml","yml","rtf"]:
(
mol,
Expand All @@ -477,6 +454,7 @@ def compute_confs(
if os.path.basename(Path(self.args.input)).split(".")[1] not in ["csv","cdx","txt","yaml","yml","rtf"]:
self.args.log.finalize()
sys.exit()
bar.next()
return

else:
Expand All @@ -487,6 +465,7 @@ def compute_confs(
if os.path.basename(Path(self.args.input)).split(".")[1] not in ["csv","cdx","txt","yaml","yml","rtf"]:
self.args.log.finalize()
sys.exit()
bar.next()
return

# check if the optimization is constrained
Expand Down Expand Up @@ -551,6 +530,7 @@ def compute_confs(
if os.path.basename(Path(self.args.input)).split(".")[1] not in ["csv","cdx","txt","yaml","yml","rtf"]:
self.args.log.finalize()
sys.exit()
bar.next()
return

if complex_type in accepted_complex_types:
Expand Down Expand Up @@ -620,6 +600,7 @@ def compute_confs(
# Updates the dataframe with information about conformer generation
frames = [self.final_dup_data, total_data]
self.final_dup_data = pd.concat(frames, ignore_index=True, sort=True)
bar.next()

# automatic detection of metal atoms
def find_metal_atom(self,mol,charge,mult):
Expand Down
2 changes: 1 addition & 1 deletion aqme/qdescp.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ def gather_files_and_run(self, destination, atom_props, update_atom_props, smart
bar = IncrementalBar(
"\no Number of finished jobs from QDESCP", max=len(self.args.files)
)
# multiprocessing to accelerate QDESCP (since xTB uses 1 processor to be reproducible)
# asynchronous multithreading to accelerate QDESCP (since xTB uses 1 processor to be reproducible)
with futures.ThreadPoolExecutor(
max_workers=self.args.nprocs,
) as executor:
Expand Down
2 changes: 1 addition & 1 deletion aqme/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ def get_conf_RMS(mol1, mol2, c1, c2, heavy, max_matches_rmsd):
if heavy:
mol1 = RemoveHs(mol1)
mol2 = RemoveHs(mol2)
return GetBestRMS(mol1, mol2, c1, c2, maxMatches=max_matches_rmsd) # don't use numThreads=0 or -1 as the documentation says, it fails! (due to multiprocessing?)
return GetBestRMS(mol1, mol2, c1, c2, maxMatches=max_matches_rmsd) # don't use numThreads=0 or -1 as the documentation says, it fails! (due to multithreading in CSEARCH module?)


def command_line_args():
Expand Down