added data processor for hydrophobic and acid CAs
berylgithub committed Dec 23, 2019
1 parent 24a4d77 commit bb6db19
Showing 2 changed files with 251 additions and 48 deletions.
95 changes: 47 additions & 48 deletions data_multi_processor.py
@@ -306,58 +306,57 @@ def unit_test_y_data():



# '''
# files loader:
# '''
# path = 'C:/Users/beryl/Documents/Computational Science/Kanazawa/Thesis/Dataset/PP'
#
# complex_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
# print(len(complex_files))
#
## test_file = path+'/'+complex_files[2]
# test_file = path+'/1nez.ent.pdb'
# print(test_file)
'''
files loader:
'''
path = 'C:/Users/beryl/Documents/Computational Science/Kanazawa/Thesis/Dataset/PP'

complex_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
print(len(complex_files))

# test_file = path+'/'+complex_files[2]
test_file = path+'/1nez.ent.pdb'
print(test_file)
#
# '''
# atom dataframe generator:
# '''
# l =[]
# with open(test_file, 'r') as f:
# for line in f:
# if line.startswith('ATOM'):
# clean_line = (line.rstrip()).split()
# #check for alignment mistakes within data, a row with spacing alignment error has 11 length after splitted by whitespace
# if len(clean_line) == 11:
# #split the 2nd last column by the 4th index (this inference is according to PDB file formatting)
# split = [clean_line[-2][:4], clean_line[-2][4:]]
# clean_line[-2] = split[1]
# clean_line.insert(-2, split[0])
# #check if coordinate data collumns are collided (most likely happens between x and y coor)
# if len(clean_line[6])>=13:
# split = [clean_line[6][:-8], clean_line[6][-8:]]
# last_elem = clean_line.pop()
# clean_line[-1] = last_elem
# clean_line.insert(6, split[0])
# clean_line[7] = split[1]
# if len(clean_line[7])>=13:
# split = [clean_line[7][:-8], clean_line[7][-8:]]
# last_elem = clean_line.pop()
# clean_line[-1] = last_elem
# clean_line.insert(7, split[0])
# clean_line[8] = split[1]
# l.append(clean_line)
# elif line.startswith('TER'):
# clean_line = (line.rstrip()).split()
# l.append(clean_line)
# elif line.startswith('ENDMDL'):
# break
# df_atoms = (pd.DataFrame(l)).rename(columns={0:'record', 6:'x_coor', 7:'y_coor', 8:'z_coor', 11:'atom_type'})
# for i in range(len(l)):
# print(i, l[i])
#
# print(l[2013])
# print(len(l[293][6].split('-')))
# print(df_atoms)
l = []
with open(test_file, 'r') as f:
for line in f:
if line.startswith('ATOM'):
clean_line = (line.rstrip()).split()
            #check for alignment mistakes in the data; a row with a spacing alignment error has length 11 after splitting on whitespace
if len(clean_line) == 11:
                #split the second-to-last column at index 4 (inferred from the fixed-column PDB file format)
split = [clean_line[-2][:4], clean_line[-2][4:]]
clean_line[-2] = split[1]
clean_line.insert(-2, split[0])
            #check whether coordinate columns have collided (most often between the x and y coordinates)
if len(clean_line[6])>=13:
split = [clean_line[6][:-8], clean_line[6][-8:]]
last_elem = clean_line.pop()
clean_line[-1] = last_elem
clean_line.insert(6, split[0])
clean_line[7] = split[1]
if len(clean_line[7])>=13:
split = [clean_line[7][:-8], clean_line[7][-8:]]
last_elem = clean_line.pop()
clean_line[-1] = last_elem
clean_line.insert(7, split[0])
clean_line[8] = split[1]
l.append(clean_line)
elif line.startswith('TER'):
clean_line = (line.rstrip()).split()
l.append(clean_line)
elif line.startswith('ENDMDL'):
break
df_atoms = (pd.DataFrame(l)).rename(columns={0:'record', 2:'atom_name', 6:'x_coor', 7:'y_coor', 8:'z_coor', 11:'atom_type'})
for i in range(len(l)):
print(i, l[i])


print(df_atoms)
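
#illustrative aside (a sketch, not the method used above): PDB ATOM records are
#fixed-width, so slicing each line by the official column ranges would avoid the
#whitespace-collision fixes entirely; parse_atom_line_fixed_width is a
#hypothetical helper, with column indices taken from the standard PDB format
def parse_atom_line_fixed_width(line):
    return {
        'record': line[0:6].strip(),
        'atom_name': line[12:16].strip(),
        'amino_acid_type': line[17:20].strip(),
        'chain': line[21],
        'x_coor': float(line[30:38]),
        'y_coor': float(line[38:46]),
        'z_coor': float(line[46:54]),
        'atom_type': line[76:78].strip(),
    }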
#
# '''
# split dataframes based on chains ended by "TER"
204 changes: 204 additions & 0 deletions data_processor_polar.py
@@ -0,0 +1,204 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 20 13:19:00 2019
@author: Saint8312
"""
import numpy as np
import pandas as pd
import sys, os
import time
import multiprocessing
import itertools
import pickle
import data_checker
import data_multi_processor





def hydrophobic_acid_patch_interactions(paramlist):
'''
    get the coordinates of the CAs, classify them by amino acid type (hydrophobic vs. charged polar/acid), then compute the Euclidean-distance/Heaviside interaction count as usual (a vectorized sketch follows this function)
- hydrophobics = ['ALA','VAL','ILE','LEU','MET','PHE','TYR','TRP']
- acids = ['ARG','HIS','LYS','ASP','GLU']
'''
path = paramlist[0]
id_name = paramlist[1]
cutoff = paramlist[2]
print("processing ",id_name)

path_file = path+'/'+id_name
    l = []
with open(path_file, 'r') as f:
for line in f:
if line.startswith('ATOM'):
clean_line = (line.rstrip()).split()
                #check for alignment mistakes in the data; a row with a spacing alignment error has length 11 after splitting on whitespace
if len(clean_line) == 11:
                    #split the second-to-last column at index 4 (inferred from the fixed-column PDB file format)
split = [clean_line[-2][:4], clean_line[-2][4:]]
clean_line[-2] = split[1]
clean_line.insert(-2, split[0])
#check if the chain identifier is misaligned
if len(clean_line[4])>1:
split = [clean_line[4][0], clean_line[4][1:]]
clean_line[4] = split[0]
clean_line.insert(5, split[1])
                #check whether coordinate columns have collided (most often between the x and y coordinates)
if len(clean_line[6])>=13:
split = [clean_line[6][:-8], clean_line[6][-8:]]
last_elem = clean_line.pop()
clean_line[-1] = last_elem
clean_line.insert(6, split[0])
clean_line[7] = split[1]
if len(clean_line[7])>=13:
split = [clean_line[7][:-8], clean_line[7][-8:]]
last_elem = clean_line.pop()
clean_line[-1] = last_elem
clean_line.insert(7, split[0])
clean_line[8] = split[1]
l.append(clean_line)
elif line.startswith('TER'):
clean_line = (line.rstrip()).split()
l.append(clean_line)
elif line.startswith('ENDMDL'):
break
df_atoms = (pd.DataFrame(l)).rename(columns={0:'record', 6:'x_coor', 7:'y_coor', 8:'z_coor', 11:'atom_type', 2:'atom_name', 3:'amino_acid_type'})

#dataframe splitter:
l_df = []
last_idx = 0
for idx in df_atoms.index[df_atoms['record'] == 'TER'].tolist():
l_df.append(df_atoms.iloc[last_idx:idx])
last_idx = idx+1
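    #note: any rows after the final 'TER' record (an unterminated trailing chain) are not appended here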

#hydrophobics and acids types of amino acids
hydrophobics = ['ALA','VAL','ILE','LEU','MET','PHE','TYR','TRP']
acids = ['ARG','HIS','LYS','ASP','GLU']

    #select the alpha carbon (CA) atoms based on the amino acid type
hydrophobics_patches = []
for i in range(len(l_df)):
mol_patch=l_df[i].set_index(['amino_acid_type'])
hydrophobics_patches.append(mol_patch.loc[ (mol_patch.index.isin(hydrophobics)) & (mol_patch['atom_name'] == 'CA') ])

acid_patches = []
for i in range(len(l_df)):
mol_patch=l_df[i].set_index(['amino_acid_type'])
acid_patches.append(mol_patch.loc[ (mol_patch.index.isin(acids)) & (mol_patch['atom_name'] == 'CA') ])

patches = [hydrophobics_patches, acid_patches]

    #accumulate interactions over all pairwise combinations of protein patches
x_vector = np.zeros(2)
patch_idx = 0
for patch in patches:
sum_interactions = 0
comb_ = itertools.combinations(patch, 2)
for c_ in list(comb_):
            #count the within-cutoff CA-CA contacts between the two protein patches:
coors_0 = (c_[0][["x_coor", "y_coor", "z_coor"]]).to_numpy(dtype=float)
coors_1 = (c_[1][["x_coor", "y_coor", "z_coor"]]).to_numpy(dtype=float)
product_coors = np.array(list(itertools.product(coors_0, coors_1)))
# if pool:
# euclid_dists = pool.map(data_multi_processor.f_euc_mp, product_coors)
# euclid_dists = np.array(list(euclid_dists))
# paramlist = list(itertools.product(euclid_dists, [cutoff]))
# heavisides = pool.map(data_multi_processor.f_heaviside_mp, paramlist)
# heavisides = np.array(list(heavisides))
# else:
euclid_dists = np.array(list(map(data_multi_processor.f_euc_mp, product_coors)))
paramlist = list(itertools.product(euclid_dists, [cutoff]))
heavisides = np.array(list(map(data_multi_processor.f_heaviside_mp, paramlist)))
sum_interactions += np.sum(heavisides)
x_vector[patch_idx] = sum_interactions
patch_idx+=1
return {'id':id_name, 'h_a_vector':x_vector}
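
#illustrative aside: a vectorized sketch of the pairwise count accumulated above,
#assuming f_euc_mp returns the Euclidean distance of a coordinate pair and
#f_heaviside_mp returns 1 when the distance is within the cutoff;
#count_contacts_sketch is a hypothetical helper and reuses the numpy import above
def count_contacts_sketch(coors_0, coors_1, cutoff):
    #pairwise (n x m) distance matrix between the two CA coordinate sets
    dists = np.linalg.norm(coors_0[:, None, :] - coors_1[None, :, :], axis=-1)
    #Heaviside step at the cutoff, summed over all pairs
    return int(np.sum(dists <= cutoff))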


if __name__ == '__main__':
'''
    create subset matrices from the dataset; the default matrices are (N, 81), where N is the total number of data points
    the subset is (N, 16), keeping only the [C,N,O,S] atom types
'''
dataset = data_checker.data_load(os.getcwd()+'/dataset.pkl')
saved_id = [d['id'] for d in dataset]
# print('processed protein IDs = ',saved_id, print(len(saved_id)))

sorted_dataset = sorted(dataset, key = lambda k:k['id'])
atom_types = ['C','N','O','F','P','S','Cl','Br','I']
subset_atom_types = ['C','N','O','S']
subset_exclude = list(set(atom_types)-set(subset_atom_types))
paramlist = list(itertools.product(atom_types, atom_types))
# print(paramlist)
idx_l = []
for i in range(len(paramlist)):
result = not any(elem in paramlist[i] for elem in subset_exclude)
if result:
idx_l.append(i)
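    #sanity check (illustrative): 9 atom types give 9*9 = 81 ordered pairs, and
    #keeping only ['C','N','O','S'] on both sides leaves 4*4 = 16 indices
    assert len(idx_l) == 16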

for i in range(len(sorted_dataset)):
sorted_dataset[i]['x_vector'] = sorted_dataset[i]['x_vector'][idx_l]

'''
hydrophobic and acid patches interactions calculation
'''
path = 'C:/Users/beryl/Documents/Computational Science/Kanazawa/Thesis/Dataset/PP'
complex_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
complexes = complex_files
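    #CA-CA contact cutoff; PDB coordinates are in angstroms, so this is a 12 Å cutoff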
cutoff = 12

filename = 'h_a_vec.pkl'

pool = multiprocessing.Pool()

#start of the process
start_time = time.time()

# paramlist = list(itertools.product([path], complexes, [cutoff]))
# h_a_vec = pool.map(hydrophobic_acid_patch_interactions, paramlist)
# h_a_vec = list(h_a_vec)

    #check whether the ID already exists in the file; if so, skip it
data = []
try:
with open(filename, 'rb') as fr:
print(filename, 'is found')
try:
while True:
data.append(pickle.load(fr))
except EOFError:
pass
except FileNotFoundError:
        print('File not found')
saved_ids = [d['id'] for d in data]

try:
i=0
for id_file in complexes:
if id_file in saved_ids:
continue
else:
print("start of process for ID :",id_file)
paramlist = [path, id_file, cutoff]
h_a_vec = hydrophobic_acid_patch_interactions(paramlist)
print("ID : ", id_file)
print('value of x vector (R^N) = ', h_a_vec)
with open(filename, 'ab') as f:
pickle.dump(h_a_vec, f)
i+=1
except KeyboardInterrupt:
print('interrupted !!')


end_time = time.time()
print("the number of protein processed in current run = ",i)
print('time elapsed =',end_time-start_time,'seconds')

'''
data checker
'''
data = data_checker.data_load(filename)
print(data)
