Skip to content

Commit

Permalink
completed feature calculator functions (no tests yet)
Browse files Browse the repository at this point in the history
  • Loading branch information
berylgithub committed Jan 14, 2020
1 parent f5b9753 commit 6505a5f
Showing 1 changed file with 168 additions and 9 deletions.
177 changes: 168 additions & 9 deletions feature_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import numpy as np
import pandas as pd
import itertools

'''
math functions
Expand Down Expand Up @@ -67,23 +68,22 @@ def f_proteins_interaction(df_protein_A, df_protein_B, atom_types, cutoff):
for b_type in atom_types:
#calculate the interaction of each atoms:
sum_interaction = 0
a_atoms = df_protein_A.loc[df_protein_A['atom_type'] == a_type]
b_atoms = df_protein_B.loc[df_protein_B['atom_type'] == b_type]
a_atoms = df_protein_A.loc[df_protein_A['element_symbol'] == a_type]
b_atoms = df_protein_B.loc[df_protein_B['element_symbol'] == b_type]
for i in range(a_atoms.shape[0]):
for j in range(b_atoms.shape[0]):
#get the (x,y,z):
a_atom = a_atoms.iloc[i]
b_atom = b_atoms.iloc[j]
a_coord = np.array([float(a_atom['x_coor']), float(a_atom['y_coor']), float(a_atom['z_coor'])])
b_coord = np.array([float(b_atom['x_coor']), float(b_atom['y_coor']), float(b_atom['z_coor'])])
a_coord = np.array([float(a_atom['x_coord']), float(a_atom['y_coord']), float(a_atom['z_coord'])])
b_coord = np.array([float(b_atom['x_coord']), float(b_atom['y_coord']), float(b_atom['z_coord'])])
#calculate the euclidean distance and heaviside step value:
sum_interaction += f_h_step(x=f_euclid_dist(a_coord, b_coord), a=cutoff)
x_vector[idx] = sum_interaction
idx+=1
print(x_vector)
return x_vector

def x_processor(chains, id_name, atom_types, cutoff):
def x_atom_dist(chains, id_name, atom_types, cutoff):
#vector calculation:
x_vector = np.zeros(len(atom_types)**2)
length = len(chains)
Expand All @@ -95,12 +95,171 @@ def x_processor(chains, id_name, atom_types, cutoff):
x_vector += f_proteins_interaction(chains[i], chains[j], atom_types, cutoff)
return {'id':id_name, 'x_vector':x_vector}

def _ca_coordinates(chain, residue_names):
    '''
    return the (n, 3) float array of x/y/z coordinates of the carbon-alpha
    ('CA') atoms of `chain` whose residue name is in `residue_names`;
    the array has shape (0, 3) when nothing matches
    '''
    mask = chain['residue_name'].isin(residue_names) & (chain['atom_name'] == 'CA')
    return chain.loc[mask, ["x_coord", "y_coord", "z_coord"]].to_numpy(dtype=float)


def _count_within_cutoff(coors_0, coors_1, cutoff):
    '''
    count the (row of coors_0, row of coors_1) pairs whose euclidean
    distance is <= cutoff
    '''
    #broadcast to an (n, m, 3) array of differences instead of building
    #every pair as a python object and calling a function per pair
    diffs = coors_0[:, np.newaxis, :] - coors_1[np.newaxis, :, :]
    dists = np.linalg.norm(diffs, axis=-1)
    return np.count_nonzero(dists <= cutoff)


def x_hydrophobic_acid(chains, id_name, cutoff):
    '''
    get the coordinates of CAs and then classify them based on the type of amino acid (hydrophobic, charged polar/acid), and then calculate the euclidean-heaviside as usual
    - hydrophobics = ['ALA','VAL','ILE','LEU','MET','PHE','TYR','TRP']
    - acids = ['ARG','HIS','LYS','ASP','GLU']

    parameters:
    - chains : sequence of DataFrames, one per chain; the columns read are
      'residue_name', 'atom_name', 'x_coord', 'y_coord', 'z_coord'
    - id_name : identifier, printed and copied into the result unchanged
    - cutoff : distance threshold; a pair counts when distance <= cutoff

    returns {'id': id_name, 'h_a_vector': array of 2 floats} where index 0 is
    the hydrophobic count and index 1 the acid count, each summed over every
    unordered pair of distinct chains (both are 0 with fewer than two chains)
    '''

    print("processing ",id_name)

    #hydrophobics and acids types of amino acids
    hydrophobics = ['ALA','VAL','ILE','LEU','MET','PHE','TYR','TRP']
    acids = ['ARG','HIS','LYS','ASP','GLU']

    x_vector = np.zeros(2)
    for patch_idx, residue_names in enumerate((hydrophobics, acids)):
        #one coordinate array per chain for this amino acid class
        patch = [_ca_coordinates(chain, residue_names) for chain in chains]
        sum_interactions = 0
        #only interactions between two different chains are counted
        for coors_0, coors_1 in itertools.combinations(patch, 2):
            sum_interactions += _count_within_cutoff(coors_0, coors_1, cutoff)
        x_vector[patch_idx] = sum_interactions
    return {'id':id_name, 'h_a_vector':x_vector}

def x_processor(chains, id_name, atom_types, cutoff):
    '''
    build the full feature vector of one complex: the result of x_atom_dist
    with the 'h_a_vector' of x_hydrophobic_acid appended to its 'x_vector'

    returns the dictionary produced by x_atom_dist, with 'x_vector' replaced
    by the concatenated vector
    '''
    result = x_atom_dist(chains, id_name, atom_types, cutoff)
    hydrophobic_acid_part = x_hydrophobic_acid(chains, id_name, cutoff)["h_a_vector"]
    #the atom-type features come first, the hydrophobic/acid features last
    combined = np.concatenate((result['x_vector'], hydrophobic_acid_part))
    result['x_vector'] = combined
    return result

'''
multiprocessor feature functions
'''
def f_euc_mp(params):
    '''
    euclidean distance between the two points packed in params:
    params[0] and params[1] are subtracted and the norm of the difference
    is returned (single packed argument so it can be used with map)
    '''
    point_a = params[0]
    point_b = params[1]
    displacement = point_a - point_b
    return np.linalg.norm(displacement)

def f_heaviside_mp(params):
    '''
    step function on a packed (value, threshold) pair:
    returns 1 when params[0] <= params[1], otherwise 0
    '''
    value = params[0]
    threshold = params[1]
    if value <= threshold:
        return 1
    return 0


def f_proteins_interaction_mp(df_protein_A, df_protein_B, atom_types, cutoff, pool):
    '''
    for every ordered pair (a_type, b_type) of atom_types, count the atom
    pairs (a_type atom of protein A, b_type atom of protein B) whose
    euclidean distance is <= cutoff

    - df_protein_A, df_protein_B : DataFrames; the columns read are
      'element_symbol', 'x_coord', 'y_coord', 'z_coord'
    - pool : object whose map(func, iterable) is used to evaluate the
      distances and the step function (presumably a multiprocessing pool,
      to be confirmed against the caller)

    returns an array of len(atom_types)**2 counts, a_type varying slowest
    '''
    def coords_of(frame, symbol):
        #coordinates of every atom of `frame` with the given element symbol
        records = frame.loc[frame['element_symbol'] == symbol].to_dict('records')
        return np.array(
            [[rec['x_coord'], rec['y_coord'], rec['z_coord']] for rec in records],
            dtype=float,
        )

    x_vector = np.zeros(len(atom_types)**2)
    type_pairs = itertools.product(atom_types, atom_types)
    for slot, (a_type, b_type) in enumerate(type_pairs):
        a_coords = coords_of(df_protein_A, a_type)
        b_coords = coords_of(df_protein_B, b_type)
        #all (atom of A, atom of B) coordinate pairs for this type pair
        coord_pairs = list(itertools.product(a_coords, b_coords))
        euclid_dists = np.array(list(pool.map(f_euc_mp, coord_pairs)))
        dist_cutoff_pairs = [(dist, cutoff) for dist in euclid_dists]
        heavisides = np.array(list(pool.map(f_heaviside_mp, dist_cutoff_pairs)))
        x_vector[slot] = np.sum(heavisides)
    return x_vector

def x_atom_dist_mp(params):
    '''
    sum the f_proteins_interaction_mp vectors over every unordered pair of
    distinct chains

    params is a packed argument list:
    [chains, id_name, atom_types, cutoff, pool]

    returns {'id': id_name, 'x_vector': summed vector of len(atom_types)**2}
    '''
    chains, id_name, atom_types, cutoff, pool = (
        params[0], params[1], params[2], params[3], params[4]
    )
    #vector calculation:
    x_vector = np.zeros(len(atom_types)**2)
    #every (i, j) with j > i, in increasing order of i then j
    for i, j in itertools.combinations(range(len(chains)), 2):
        print('protein chain :', i, j)
        x_vector += f_proteins_interaction_mp(chains[i], chains[j], atom_types, cutoff, pool)
    return {'id':id_name, 'x_vector':x_vector}

if __name__ == '__main__':
import pdb_processor as pdbp
def x_hydrophobic_acid_mp(params):
    '''
    count, for the hydrophobic and for the acid amino acids separately, the
    carbon-alpha pairs of two different chains whose euclidean distance is
    <= cutoff, evaluating distances and step function through pool.map

    params is a packed argument list: [chains, id_name, cutoff, pool]
    - chains : sequence of DataFrames; the columns read are 'residue_name',
      'atom_name', 'x_coord', 'y_coord', 'z_coord'
    - pool : object providing map(func, iterable) (presumably a
      multiprocessing pool, to be confirmed against the caller)

    returns {'id': id_name, 'h_a_vector': [hydrophobic count, acid count]}
    '''
    chains, id_name, cutoff, pool = params[0], params[1], params[2], params[3]

    print("processing ",id_name)

    #hydrophobics and acids types of amino acids
    hydrophobics = ['ALA','VAL','ILE','LEU','MET','PHE','TYR','TRP']
    acids = ['ARG','HIS','LYS','ASP','GLU']

    #select the carbon alpha of atoms based on the amino acid types
    hydrophobics_patches = []
    acid_patches = []
    for chain in chains:
        by_residue = chain.set_index(['residue_name'])
        is_ca = by_residue['atom_name'] == 'CA'
        hydrophobics_patches.append(by_residue.loc[(by_residue.index.isin(hydrophobics)) & is_ca])
        acid_patches.append(by_residue.loc[(by_residue.index.isin(acids)) & is_ca])

    coord_columns = ["x_coord", "y_coord", "z_coord"]

    #create the combination of protein patches interactions
    x_vector = np.zeros(2)
    for patch_idx, patch in enumerate([hydrophobics_patches, acid_patches]):
        sum_interactions = 0
        for first, second in itertools.combinations(patch, 2):
            #distance-cutoff between CAs of two protein patches:
            coors_0 = first[coord_columns].to_numpy(dtype=float)
            coors_1 = second[coord_columns].to_numpy(dtype=float)
            product_coors = np.array(list(itertools.product(coors_0, coors_1)))
            euclid_dists = np.array(list(pool.map(f_euc_mp, product_coors)))
            paramlist = [(dist, cutoff) for dist in euclid_dists]
            heavisides = np.array(list(pool.map(f_heaviside_mp, paramlist)))
            sum_interactions += np.sum(heavisides)
        x_vector[patch_idx] = sum_interactions
    return {'id':id_name, 'h_a_vector':x_vector}

def x_processor_mp(params):
    '''
    pool-based counterpart of x_processor: build the full feature vector of
    one complex by appending the 'h_a_vector' of x_hydrophobic_acid_mp to
    the 'x_vector' of x_atom_dist_mp

    params is a packed argument list:
    [chains, id_name, atom_types, cutoff, pool]

    returns the dictionary produced by x_atom_dist_mp, with 'x_vector'
    replaced by the concatenated vector
    '''
    chains = params[0]
    id_name = params[1]
    atom_types = params[2]
    cutoff = params[3]
    pool = params[4]
    #the *_mp variants take one packed list; the serial x_atom_dist and
    #x_hydrophobic_acid take separate positional arguments and would raise
    #TypeError if handed a single list
    x_dict = x_atom_dist_mp([chains, id_name, atom_types, cutoff, pool])
    ha_dict = x_hydrophobic_acid_mp([chains, id_name, cutoff, pool])
    x_dict['x_vector'] = np.concatenate((x_dict['x_vector'], ha_dict["h_a_vector"]))
    return x_dict

0 comments on commit 6505a5f

Please sign in to comment.