From a5d114a34473c349e5be60584d44c258d0e2eb75 Mon Sep 17 00:00:00 2001
From: Mohammed AlQuraishi <alquraishi@users.noreply.github.com>
Date: Wed, 24 Apr 2019 13:23:20 -0400
Subject: [PATCH] Create text_parser.py

---
 code/text_parser.py | 98 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 code/text_parser.py

diff --git a/code/text_parser.py b/code/text_parser.py
new file mode 100644
index 0000000..539d03a
--- /dev/null
+++ b/code/text_parser.py
@@ -0,0 +1,98 @@
+"""
+Text-based parser for ProteinNet Records.
+"""
+
+__author__ = "Mohammed AlQuraishi"
+__copyright__ = "Copyright 2019, Harvard Medical School"
+__license__ = "MIT"
+
+#!/usr/bin/python
+
+# imports
+import sys
+import re
+
+# Constants
+NUM_DIMENSIONS = 3
+
+# Functions for conversion from Mathematica protein files to TFRecords
+_aa_dict = {'A': '0', 'C': '1', 'D': '2', 'E': '3', 'F': '4', 'G': '5', 'H': '6', 'I': '7', 'K': '8', 'L': '9', 'M': '10', 'N': '11', 'P': '12', 'Q': '13', 'R': '14', 'S': '15', 'T': '16', 'V': '17', 'W': '18', 'Y': '19'}
+_dssp_dict = {'L': '0', 'H': '1', 'B': '2', 'E': '3', 'G': '4', 'I': '5', 'T': '6', 'S': '7'}
+_mask_dict = {'-': '0', '+': '1'}
+
+class switch(object):
+    """Switch statement for Python, based on recipe from Python Cookbook."""
+
+    def __init__(self, value):
+        self.value = value
+        self.fall = False
+
+    def __iter__(self):
+        """Return the match method once, then stop"""
+        yield self.match
+        raise StopIteration
+    
+    def match(self, *args):
+        """Indicate whether or not to enter a case suite"""
+        if self.fall or not args:
+            return True
+        elif self.value in args: # changed for v1.5
+            self.fall = True
+            return True
+        else:
+            return False
+
+def letter_to_num(string, dict_):
+    """ Convert string of letters to list of ints """
+    patt = re.compile('[' + ''.join(dict_.keys()) + ']')
+    num_string = patt.sub(lambda m: dict_[m.group(0)] + ' ', string)
+    num = [int(i) for i in num_string.split()]
+    return num
+
+def read_record(file_, num_evo_entries):
+    """ Read a Mathematica protein record from file and convert into dict. """
+    
+    dict_ = {}
+
+    while True:
+        next_line = file_.readline()
+        for case in switch(next_line):
+            if case('[ID]' + '\n'):
+                id_ = file_.readline()[:-1]
+                dict_.update({'id': id_})
+            elif case('[PRIMARY]' + '\n'):
+                primary = letter_to_num(file_.readline()[:-1], _aa_dict)
+                dict_.update({'primary': primary})
+            elif case('[EVOLUTIONARY]' + '\n'):
+                evolutionary = []
+                for residue in range(num_evo_entries): evolutionary.append([float(step) for step in file_.readline().split()])
+                dict_.update({'evolutionary': evolutionary})
+            elif case('[SECONDARY]' + '\n'):
+                secondary = letter_to_num(file_.readline()[:-1], _dssp_dict)
+                dict_.update({'secondary': secondary})
+            elif case('[TERTIARY]' + '\n'):
+                tertiary = []
+                for axis in range(NUM_DIMENSIONS): tertiary.append([float(coord) for coord in file_.readline().split()])
+                dict_.update({'tertiary': tertiary})
+            elif case('[MASK]' + '\n'):
+                mask = letter_to_num(file_.readline()[:-1], _mask_dict)
+                dict_.update({'mask': mask})
+            elif case('\n'):
+                return dict_
+            elif case(''):
+                return None
+
+# main. accepts two command-line arguments: input file and the number of entries in evo profiles, and outputs dicts to stdout
+if __name__ == '__main__':
+    input_path      = sys.argv[1] 
+    num_evo_entries = int(sys.argv[2]) if len(sys.argv) == 3 else 20 # default number of evo entries
+
+    input_file = open(input_path, 'r')
+        
+    while True:
+        dict_ = read_record(input_file, num_evo_entries)
+        if dict_ is not None:
+            print dict_
+        else:
+            input_file.close()
+            break