FASTA & BED to FASTA file converter

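This program takes a FASTA genome sequence and a BED file of query regions and writes the sequence of each region to a single FASTA file for easy use and access. The output is both printed to the console and saved to a file. Notable features: (1) optional conversion of the sequence to RNA; (2) parser-style (command-line argument) user input.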
SohanChoudhury · Aug 17, 2016 · 7456741 · 7456741
1 parent 00fd126
commit 7456741
Showing 1 changed file with 136 additions and 0 deletions.
diff --git a/FASTA_converter_1.13.py b/FASTA_converter_1.13.py
@@ -0,0 +1,136 @@
+# File: FASTA_converter_1.13.py
+
+
## Main_process performs the main work of the program: it reads the BED input
## file record by record and writes the matching sequences to the output FASTA
## file (echoing everything to the console as it goes).

def _read_fasta_sequence(fasta_path, name):
    """Return the full sequence of the FASTA record called *name*, or None.

    All sequence lines of the record are joined, so wrapped (multi-line)
    FASTA files work as well as one-line-per-record files.  A header whose
    first token equals *name* exactly wins; if there is none, the first
    header that merely contains *name* is used instead.
    """
    fallback = None
    with open(fasta_path, "r") as fasta:
        chunks = None   # sequence pieces of the record being collected, if any
        exact = False   # whether that record's identifier equals *name*
        for raw_line in fasta:
            line = raw_line.strip()
            if line.startswith(">"):
                # A new header closes the record collected so far.
                if chunks is not None:
                    if exact:
                        return "".join(chunks)
                    if fallback is None:
                        fallback = "".join(chunks)
                tokens = line[1:].split()
                exact = bool(tokens) and tokens[0] == name
                # Collect exact matches always; substring matches only until
                # a first fallback candidate has been stored.
                if exact or (fallback is None and name in line):
                    chunks = []
                else:
                    chunks = None
            elif chunks is not None:
                chunks.append(line)
        if chunks is not None:
            if exact:
                return "".join(chunks)
            if fallback is None:
                fallback = "".join(chunks)
    return fallback


def Main_process(BEDin_in, FASTAin_in, RnaDNA_in, OUTPUTin):
    """Extract every BED region from a FASTA file into a new FASTA file.

    For each BED record a header line
    ``>field1 - field2 - field3 - field4`` is printed and written to the
    output file, followed by the sequence produced by ``output_print`` for
    ``sequence[start:end]`` (start and end are used directly as a Python
    slice).

    Args:
        BEDin_in: Path of the BED file.  Each record needs at least five
            whitespace-separated fields: sequence name, start, end, a
            free-text field, and the strand sign ("-" selects the reverse
            complement).
        FASTAin_in: Path of the FASTA file holding the sequences.
        RnaDNA_in: "y" to emit RNA (T replaced by U); anything else emits DNA.
        OUTPUTin: Path of the FASTA file to create (overwritten if present).

    Raises:
        IndexError: If a BED record has fewer than five fields.
        ValueError: If the start or end field is not an integer.
        OSError: If one of the files cannot be opened.

    Blank lines and lines starting with "#", "track" or "browser" are
    skipped.  When a sequence name is not found in the FASTA file only the
    header line is emitted for that record.
    """
    minus_sign = "-"
    cached_name = None       # name of the sequence currently held in memory
    cached_sequence = None   # its sequence, or None if it was not found

    with open(BEDin_in, "r") as bed, open(OUTPUTin, "w") as output:
        for line in bed:
            fields = line.split()
            if not fields:
                continue
            if fields[0].startswith("#") or fields[0] in ("track", "browser"):
                continue

            # Parse the whole record before emitting anything, so a malformed
            # line cannot leave a dangling header in the output file.
            name = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            info = fields[3]
            strand = fields[4]

            header = ">" + name + " - " + fields[1] + " - " + fields[2] + " - " + info
            print(header)
            output.write(header + '\n')

            # Consecutive records on the same sequence reuse the loaded copy
            # instead of rescanning the FASTA file.
            if name != cached_name:
                cached_sequence = _read_fasta_sequence(FASTAin_in, name)
                cached_name = name

            if cached_sequence is not None:
                output_print(cached_sequence[start:end], strand, minus_sign,
                             RnaDNA_in, output)

            print(' ')

    # Both files are closed at this point, so the output is fully on disk.
    print("Output saved.")
    print(OUTPUTin)
    print(' ')

    print("© Sohan Choudhury")
    print("Department of Hematology | Yale University")

    print(' ')
+
+
+
## output_print does two things. If the BED record is on the "-" strand, it
## replaces the sequence by its reverse complement. If the user asked for RNA
## output, it replaces every T in the sequence with U.

# Translation tables are built once; both cover lower-case (soft-masked) bases.
_COMPLEMENT = str.maketrans("ATGCatgc", "TACGtacg")
_DNA_TO_RNA = str.maketrans("Tt", "Uu")


def output_print(string_in, signout_in, SignMinus_in, OUTPUTraw3_in, OUTPUT):
    """Print one sequence and write it, newline-terminated, to OUTPUT.

    Args:
        string_in: The nucleotide sequence to emit.  Trailing line
            terminators are dropped so they cannot end up in front of a
            reversed sequence.
        signout_in: Strand sign of the current record.
        SignMinus_in: The sign that denotes the minus strand; when
            ``signout_in`` equals it, the reverse complement is emitted.
        OUTPUTraw3_in: "y" to convert the result to RNA (T -> U, t -> u).
        OUTPUT: Writable text file object receiving the sequence line.

    Characters other than A, T, G and C (either case) are passed through
    unchanged by the complement step.
    """
    sequence = string_in.rstrip("\r\n")

    if signout_in == SignMinus_in:
        # Complement every base, then reverse the whole string.
        sequence = sequence.translate(_COMPLEMENT)[::-1]

    if OUTPUTraw3_in == "y":
        sequence = sequence.translate(_DNA_TO_RNA)

    print(sequence)
    OUTPUT.write(sequence + '\n')
+
+
+
## Below is the user interface, created with argparse. The first two
## arguments, the input BED and genome FASTA files, are required. The rest are
## optional.

import argparse
import os


def default_output_name(bed_path):
    """Return the output path used when -o is not given.

    The extension of *bed_path* is swapped for ".fa".  Only the extension is
    touched, so a "bed" elsewhere in the path is left alone.  If that would
    yield the input path itself, ".fa" is appended instead so the BED file is
    never overwritten.
    """
    root, _extension = os.path.splitext(bed_path)
    candidate = root + ".fa"
    if candidate == bed_path:
        candidate = bed_path + ".fa"
    return candidate


def build_parser():
    """Create the command-line parser for the converter.

    File arguments are plain paths: nothing is opened (or truncated) while
    the command line is being parsed.
    """
    parser = argparse.ArgumentParser(description='This is a bioinformatics file format converter that utilizes Python 3. Input your BED file as well as a FASTA sequence to obtain the respective genome sequence in the FASTA format. Order does not matter. Only first two fields are required. Input format: programlocation -b BEDfilelocation -g Sequencelocation -r y/n -o Output')
    parser.add_argument("-b", required=True, type=str, help="BED file *required*")
    parser.add_argument("-g", required=True, type=str, help="Genome sequence file *required*")
    parser.add_argument("-r", type=str, help="RNA for output FASTA? (y/n)", choices=['y', 'n'], nargs='?', default='n')
    parser.add_argument("-o", type=str, help="Name of output file", nargs='?')
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="increase output verbosity")
    return parser


def main(argv=None):
    """Parse *argv* (default: sys.argv[1:]) and run the conversion.

    With -v/--verbose the resolved arguments are printed first; the
    conversion runs in either case.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Report missing inputs as a usage error rather than a traceback.
    for flag, path in (("-b", args.b), ("-g", args.g)):
        if not os.path.isfile(path):
            parser.error("argument {}: can't open '{}'".format(flag, path))

    output_file_name = args.o if args.o else default_output_name(args.b)

    if args.verbose:
        print("Arguments: genome={}, bed={}, rna={}, output={}".format(
            args.g, args.b, args.r, output_file_name))

    Main_process(args.b, args.g, args.r, output_file_name)


if __name__ == "__main__":
    main()