Skip to content

Commit

Permalink
fixed broken data coordinate from PDB
Browse files Browse the repository at this point in the history
  • Loading branch information
berylgithub committed Dec 2, 2019
1 parent fd9b455 commit 823c883
Showing 1 changed file with 54 additions and 9 deletions.
63 changes: 54 additions & 9 deletions data_multi_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,24 @@ def data_processing(path,id_name, atom_types, cutoff):
l =[]
with open(path_file, 'r') as f:
for line in f:
if line.startswith('ATOM') or line.startswith('TER'):
if line.startswith('ATOM'):
clean_line = (line.rstrip()).split()
#check for alignment mistakes within the data; a row with a spacing alignment error has length 11 after being split on whitespace
if len(clean_line) == 11:
#split the 2nd last column by the 4th index (this inference is according to PDB file formatting)
split = [clean_line[-2][:4], clean_line[-2][4:]]
clean_line[-2] = split[1]
clean_line.insert(-2, split[0])
#check whether coordinate data columns have collided (most likely between the x and y coordinates)
if len(clean_line[6])>=13:
split = [clean_line[6][:-8], clean_line[6][-8:]]
last_elem = clean_line.pop()
clean_line[-1] = last_elem
clean_line.insert(6, split[0])
clean_line[7] = split[1]
l.append(clean_line)
elif line.startswith('TER'):
clean_line = (line.rstrip()).split()
l.append(clean_line)
elif line.startswith('ENDMDL'):
break
Expand Down Expand Up @@ -160,14 +170,24 @@ def data_multi_processing(path,id_name, atom_types, cutoff, pool):
l =[]
with open(path_file, 'r') as f:
for line in f:
if line.startswith('ATOM') or line.startswith('TER'):
if line.startswith('ATOM'):
clean_line = (line.rstrip()).split()
#check for alignment mistakes within the data; a row with a spacing alignment error has length 11 after being split on whitespace
if len(clean_line) == 11:
#split the 2nd last column by the 4th index (this inference is according to PDB file formatting)
split = [clean_line[-2][:4], clean_line[-2][4:]]
clean_line[-2] = split[1]
clean_line.insert(-2, split[0])
#check whether coordinate data columns have collided (most likely between the x and y coordinates)
if len(clean_line[6])>=13:
split = [clean_line[6][:-8], clean_line[6][-8:]]
last_elem = clean_line.pop()
clean_line[-1] = last_elem
clean_line.insert(6, split[0])
clean_line[7] = split[1]
l.append(clean_line)
elif line.startswith('TER'):
clean_line = (line.rstrip()).split()
l.append(clean_line)
elif line.startswith('ENDMDL'):
break
Expand Down Expand Up @@ -197,14 +217,24 @@ def data_multi_processing_mp(params):
l =[]
with open(path_file, 'r') as f:
for line in f:
if line.startswith('ATOM') or line.startswith('TER'):
if line.startswith('ATOM'):
clean_line = (line.rstrip()).split()
#check for alignment mistakes within the data; a row with a spacing alignment error has length 11 after being split on whitespace
if len(clean_line) == 11:
#split the 2nd last column by the 4th index (this inference is according to PDB file formatting)
split = [clean_line[-2][:4], clean_line[-2][4:]]
clean_line[-2] = split[1]
clean_line.insert(-2, split[0])
#check whether coordinate data columns have collided (most likely between the x and y coordinates)
if len(clean_line[6])>=13:
split = [clean_line[6][:-8], clean_line[6][-8:]]
last_elem = clean_line.pop()
clean_line[-1] = last_elem
clean_line.insert(6, split[0])
clean_line[7] = split[1]
l.append(clean_line)
elif line.startswith('TER'):
clean_line = (line.rstrip()).split()
l.append(clean_line)
elif line.startswith('ENDMDL'):
break
Expand Down Expand Up @@ -267,7 +297,7 @@ def unit_test_y_data():
# print(len(complex_files))
#
## test_file = path+'/'+complex_files[2]
# test_file = path+'/2wy2.ent.pdb'
# test_file = path+'/1f5r.ent.pdb'
# print(test_file)
#
# '''
Expand All @@ -276,20 +306,33 @@ def unit_test_y_data():
# l =[]
# with open(test_file, 'r') as f:
# for line in f:
# if line.startswith('ATOM') or line.startswith('TER'):
# if line.startswith('ATOM'):
# clean_line = (line.rstrip()).split()
# #check for alignment mistakes within the data; a row with a spacing alignment error has length 11 after being split on whitespace
# if len(clean_line) == 11:
# #split the 2nd last column by the 4th index (this inference is according to PDB file formatting)
# split = [clean_line[-2][:4], clean_line[-2][4:]]
# clean_line[-2] = split[1]
# clean_line.insert(-2, split[0])
# #check whether coordinate data columns have collided (most likely between the x and y coordinates)
# if len(clean_line[6])>=13:
# split = [clean_line[6][:-8], clean_line[6][-8:]]
# last_elem = clean_line.pop()
# clean_line[-1] = last_elem
# clean_line.insert(6, split[0])
# clean_line[7] = split[1]
# l.append(clean_line)
# elif line.startswith('TER'):
# clean_line = (line.rstrip()).split()
# l.append(clean_line)
# elif line.startswith('ENDMDL'):
# break
# df_atoms = (pd.DataFrame(l)).rename(columns={0:'record', 6:'x_coor', 7:'y_coor', 8:'z_coor', 11:'atom_type'})
#
#
# for i in range(len(l)):
# print(i, l[i])
#
# print(l[2013])
# print(len(l[293][6].split('-')))
# print(df_atoms)
#
# '''
Expand All @@ -303,7 +346,8 @@ def unit_test_y_data():
#
# print(df_atoms.index[df_atoms['record'] == 'TER'].tolist())
# print(l_df)

#
# print(df_atoms.iloc[293])
'''
multiprocessing unit test
'''
Expand Down Expand Up @@ -351,6 +395,7 @@ def unit_test_y_data():
#initialize parameters
path = 'C:/Users/beryl/Documents/Computational Science/Kanazawa/Thesis/Dataset/PP'
complex_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
#print(complex_files)

atom_types = ['C','N','O','F','P','S','Cl','Br','I']
cutoff = 12
Expand Down Expand Up @@ -415,5 +460,5 @@ def unit_test_y_data():
except FileNotFoundError:
print('File is not found')
saved_ids = [d['id'] for d in data]
print('processed protein IDs = ',saved_ids)
print('processed protein IDs = ',saved_ids, print(len(saved_ids)))

0 comments on commit 823c883

Please sign in to comment.