-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added data processor for hydrophobic and acid CAs
- Loading branch information
1 parent
24a4d77
commit bb6db19
Showing
2 changed files
with
251 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
# -*- coding: utf-8 -*- | ||
""" | ||
Created on Fri Dec 20 13:19:00 2019 | ||
@author: Saint8312 | ||
""" | ||
import numpy as np | ||
import pandas as pd | ||
import sys, os | ||
import time | ||
import multiprocessing | ||
import itertools | ||
import pickle | ||
import data_checker | ||
import data_multi_processor | ||
|
||
|
||
|
||
|
||
|
||
def hydrophobic_acid_patch_interactions(paramlist): | ||
''' | ||
get the coordinates of CAs and then classify them based on the type of amino acid (hydrophobic, charged polar/acid), and then calculate the euclidean-heaviside as usual | ||
- hydrophobics = ['ALA','VAL','ILE','LEU','MET','PHE','TYR','TRP'] | ||
- acids = ['ARG','HIS','LYS','ASP','GLU'] | ||
''' | ||
path = paramlist[0] | ||
id_name = paramlist[1] | ||
cutoff = paramlist[2] | ||
print("processing ",id_name) | ||
|
||
path_file = path+'/'+id_name | ||
l =[] | ||
with open(path_file, 'r') as f: | ||
for line in f: | ||
if line.startswith('ATOM'): | ||
clean_line = (line.rstrip()).split() | ||
#check for alignment mistakes within data, a row with spacing alignment error has 11 length after splitted by whitespace | ||
if len(clean_line) == 11: | ||
#split the 2nd last column by the 4th index (this inference is according to PDB file formatting) | ||
split = [clean_line[-2][:4], clean_line[-2][4:]] | ||
clean_line[-2] = split[1] | ||
clean_line.insert(-2, split[0]) | ||
#check if the chain identifier is misaligned | ||
if len(clean_line[4])>1: | ||
split = [clean_line[4][0], clean_line[4][1:]] | ||
clean_line[4] = split[0] | ||
clean_line.insert(5, split[1]) | ||
#check if coordinate data collumns are collided (most likely happens between x and y coor) | ||
if len(clean_line[6])>=13: | ||
split = [clean_line[6][:-8], clean_line[6][-8:]] | ||
last_elem = clean_line.pop() | ||
clean_line[-1] = last_elem | ||
clean_line.insert(6, split[0]) | ||
clean_line[7] = split[1] | ||
if len(clean_line[7])>=13: | ||
split = [clean_line[7][:-8], clean_line[7][-8:]] | ||
last_elem = clean_line.pop() | ||
clean_line[-1] = last_elem | ||
clean_line.insert(7, split[0]) | ||
clean_line[8] = split[1] | ||
l.append(clean_line) | ||
elif line.startswith('TER'): | ||
clean_line = (line.rstrip()).split() | ||
l.append(clean_line) | ||
elif line.startswith('ENDMDL'): | ||
break | ||
df_atoms = (pd.DataFrame(l)).rename(columns={0:'record', 6:'x_coor', 7:'y_coor', 8:'z_coor', 11:'atom_type', 2:'atom_name', 3:'amino_acid_type'}) | ||
|
||
#dataframe splitter: | ||
l_df = [] | ||
last_idx = 0 | ||
for idx in df_atoms.index[df_atoms['record'] == 'TER'].tolist(): | ||
l_df.append(df_atoms.iloc[last_idx:idx]) | ||
last_idx = idx+1 | ||
|
||
#hydrophobics and acids types of amino acids | ||
hydrophobics = ['ALA','VAL','ILE','LEU','MET','PHE','TYR','TRP'] | ||
acids = ['ARG','HIS','LYS','ASP','GLU'] | ||
|
||
#select the carbon alpha of atoms based on the amino acid types | ||
hydrophobics_patches = [] | ||
for i in range(len(l_df)): | ||
mol_patch=l_df[i].set_index(['amino_acid_type']) | ||
hydrophobics_patches.append(mol_patch.loc[ (mol_patch.index.isin(hydrophobics)) & (mol_patch['atom_name'] == 'CA') ]) | ||
|
||
acid_patches = [] | ||
for i in range(len(l_df)): | ||
mol_patch=l_df[i].set_index(['amino_acid_type']) | ||
acid_patches.append(mol_patch.loc[ (mol_patch.index.isin(acids)) & (mol_patch['atom_name'] == 'CA') ]) | ||
|
||
patches = [hydrophobics_patches, acid_patches] | ||
|
||
#create the combination of protein patches interactions | ||
x_vector = np.zeros(2) | ||
patch_idx = 0 | ||
for patch in patches: | ||
sum_interactions = 0 | ||
comb_ = itertools.combinations(patch, 2) | ||
for c_ in list(comb_): | ||
#function to calculate the distance-cutoff between CAs of two protein patches: | ||
coors_0 = (c_[0][["x_coor", "y_coor", "z_coor"]]).to_numpy(dtype=float) | ||
coors_1 = (c_[1][["x_coor", "y_coor", "z_coor"]]).to_numpy(dtype=float) | ||
product_coors = np.array(list(itertools.product(coors_0, coors_1))) | ||
# if pool: | ||
# euclid_dists = pool.map(data_multi_processor.f_euc_mp, product_coors) | ||
# euclid_dists = np.array(list(euclid_dists)) | ||
# paramlist = list(itertools.product(euclid_dists, [cutoff])) | ||
# heavisides = pool.map(data_multi_processor.f_heaviside_mp, paramlist) | ||
# heavisides = np.array(list(heavisides)) | ||
# else: | ||
euclid_dists = np.array(list(map(data_multi_processor.f_euc_mp, product_coors))) | ||
paramlist = list(itertools.product(euclid_dists, [cutoff])) | ||
heavisides = np.array(list(map(data_multi_processor.f_heaviside_mp, paramlist))) | ||
sum_interactions += np.sum(heavisides) | ||
x_vector[patch_idx] = sum_interactions | ||
patch_idx+=1 | ||
return {'id':id_name, 'h_a_vector':x_vector} | ||
|
||
|
||
if __name__ == '__main__': | ||
''' | ||
create subset matrices from the dataset, the default matrices should be (N,81) where N is the total data | ||
the subset will be (N, 16), taking only [C,N,O,S] atom types | ||
''' | ||
dataset = data_checker.data_load(os.getcwd()+'/dataset.pkl') | ||
saved_id = [d['id'] for d in dataset] | ||
# print('processed protein IDs = ',saved_id, print(len(saved_id))) | ||
|
||
sorted_dataset = sorted(dataset, key = lambda k:k['id']) | ||
atom_types = ['C','N','O','F','P','S','Cl','Br','I'] | ||
subset_atom_types = ['C','N','O','S'] | ||
subset_exclude = list(set(atom_types)-set(subset_atom_types)) | ||
paramlist = list(itertools.product(atom_types, atom_types)) | ||
# print(paramlist) | ||
idx_l = [] | ||
for i in range(len(paramlist)): | ||
result = not any(elem in paramlist[i] for elem in subset_exclude) | ||
if result: | ||
idx_l.append(i) | ||
|
||
for i in range(len(sorted_dataset)): | ||
sorted_dataset[i]['x_vector'] = sorted_dataset[i]['x_vector'][idx_l] | ||
|
||
''' | ||
hydrophobic and acid patches interactions calculation | ||
''' | ||
path = 'C:/Users/beryl/Documents/Computational Science/Kanazawa/Thesis/Dataset/PP' | ||
complex_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))] | ||
complexes = complex_files | ||
cutoff = 12 | ||
|
||
filename = 'h_a_vec.pkl' | ||
|
||
pool = multiprocessing.Pool() | ||
|
||
#start of the process | ||
start_time = time.time() | ||
|
||
# paramlist = list(itertools.product([path], complexes, [cutoff])) | ||
# h_a_vec = pool.map(hydrophobic_acid_patch_interactions, paramlist) | ||
# h_a_vec = list(h_a_vec) | ||
|
||
#check if id is already existed within file, if yes, skip it | ||
data = [] | ||
try: | ||
with open(filename, 'rb') as fr: | ||
print(filename, 'is found') | ||
try: | ||
while True: | ||
data.append(pickle.load(fr)) | ||
except EOFError: | ||
pass | ||
except FileNotFoundError: | ||
print('File is not found') | ||
saved_ids = [d['id'] for d in data] | ||
|
||
try: | ||
i=0 | ||
for id_file in complexes: | ||
if id_file in saved_ids: | ||
continue | ||
else: | ||
print("start of process for ID :",id_file) | ||
paramlist = [path, id_file, cutoff] | ||
h_a_vec = hydrophobic_acid_patch_interactions(paramlist) | ||
print("ID : ", id_file) | ||
print('value of x vector (R^N) = ', h_a_vec) | ||
with open(filename, 'ab') as f: | ||
pickle.dump(h_a_vec, f) | ||
i+=1 | ||
except KeyboardInterrupt: | ||
print('interrupted !!') | ||
|
||
|
||
end_time = time.time() | ||
print("the number of protein processed in current run = ",i) | ||
print('time elapsed =',end_time-start_time,'seconds') | ||
|
||
''' | ||
data checker | ||
''' | ||
data = data_checker.data_load(filename) | ||
print(data) |