-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This program converts FASTA genome sequences and BED queries into a single FASTA file for easy use and access. Output file is both printed and stored. Notable features: - Conversion of sequence to RNA - Parser style user input
- Loading branch information
1 parent
00fd126
commit 7456741
Showing
1 changed file
with
136 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
# File: FASTA_converter_1.13.py | ||
|
||
|
||
## This function, true to it's name, performs the main processes of the program. It reads the BED input file, while writing the output FASTA file. | ||
|
||
def Main_process(BEDin_in, FASTAin_in, RnaDNA_in, OUTPUTin): | ||
BEDin = BEDin_in | ||
FASTAin = FASTAin_in | ||
OUTPUTraw3 = RnaDNA_in | ||
|
||
|
||
BED = open(BEDin, "r") | ||
OUTPUT = open(OUTPUTin, "w") | ||
|
||
line = BED.readline() | ||
|
||
words = line.split() | ||
|
||
|
||
if words[0][0] == '#': | ||
line = BED.readline() | ||
else: | ||
A = 0 | ||
|
||
|
||
previous_word = 'NA' | ||
Match = 0 | ||
Match1st = 0 | ||
SignMinus = "-" | ||
|
||
while line: | ||
words = line.split() | ||
|
||
print( ">" + words[0] + " - " + words[1] + " - " + words[2] + " - " + words[3]) | ||
OUTPUT.write( ">" + words[0] + " - " + words[1] + " - " + words[2] + " - " + words[3] + '\n') | ||
current_word = words[0] | ||
start = int(words[1]) | ||
end = int(words[2]) | ||
more_info = words[3] | ||
signout = words[4] | ||
|
||
|
||
if previous_word == current_word: | ||
output_print( line2[start:end], signout, SignMinus, OUTPUTraw3,OUTPUT) | ||
|
||
else: | ||
FASTA = open(FASTAin, "r") | ||
line2 = FASTA.readline() | ||
|
||
while line2: | ||
if words[0] in line2 : | ||
line2 = FASTA.readline() | ||
words2 = line2.split() | ||
|
||
output_print( line2[start:end], signout, SignMinus,OUTPUTraw3,OUTPUT) | ||
|
||
FASTA.close() | ||
Match = 1 | ||
break | ||
else: | ||
line2 = FASTA.readline() | ||
words2 = line2.split() | ||
|
||
|
||
|
||
previous_word = current_word | ||
line = BED.readline() | ||
|
||
print(' ') | ||
|
||
|
||
print("Output saved.") | ||
print(OUTPUTin) | ||
print(' ') | ||
|
||
print( "© Sohan Choudhury") | ||
print( "Department of Hematology | Yale University") | ||
|
||
BED.close() | ||
OUTPUT.close() | ||
|
||
print(' ') | ||
|
||
|
||
|
||
## This function does two things. If the BED input line has a "-", it will find the complementary nucleotides of the corresponding sequence, then reverse it. | ||
## If the user asks for the output FASTA file to be RNA, the function will replace all the T's in the corresponding sequence with U's. | ||
|
||
def output_print(string_in, signout_in, SignMinus_in, OUTPUTraw3_in, OUTPUT): | ||
if signout_in == SignMinus_in: | ||
dict = str.maketrans("ATGC", "TACG") | ||
value = string_in | ||
result = value.translate(dict) | ||
if OUTPUTraw3_in == "y": | ||
print( result[::-1].replace("T","U")) | ||
OUTPUT.write(result[::-1].replace("T","U") + '\n') | ||
else: | ||
print( result[::-1]) | ||
OUTPUT.write(result[::-1] + '\n') | ||
else: | ||
if OUTPUTraw3_in == "y": | ||
print( string_in.replace("T","U")) | ||
OUTPUT.write( string_in.replace("T","U") + '\n') | ||
else: | ||
print( string_in) | ||
OUTPUT.write(string_in + '\n') | ||
|
||
|
||
|
||
## Below is the user interface, created with argparse. The first two arguments, the input BED and FASTA files, are required. The rest are optional. | ||
|
||
import os, argparse | ||
|
||
parser = argparse.ArgumentParser(description='This is a bioinformatics file format converter that utilizes Python 3. Input your BED file as well as a FASTA sequence to obtain the respective genome sequence in the FASTA format. Order does not matter. Only first two fields are required. Input format: programlocation -b BEDfilelocation -f Sequencelocation -r y/n -o Output') | ||
parser.add_argument("-b", required = True, type=argparse.FileType('r'), help="BED file *required*") | ||
parser.add_argument("-g", required = True, type=argparse.FileType('r'), help="Genome sequence file *required*") | ||
parser.add_argument("-r", type=str, help="RNA for output FASTA? (y/n)" , choices=['y', 'n'], nargs = '?', default = 'n') | ||
parser.add_argument("-o", type=argparse.FileType('w'), help="Name of output file", nargs = '?') | ||
|
||
parser.add_argument("-v", "--verbose", action="store_true", | ||
help="increase output verbosity") | ||
args = parser.parse_args() | ||
|
||
if args.verbose: | ||
print ("Arguments: ".format(args.g, args.b, args.r, args.o)) | ||
|
||
else: | ||
|
||
if args.o: | ||
OutputFileName = args.o.name | ||
else: | ||
|
||
OutputFileName = args.b.name.replace("bed", "fa") | ||
|
||
|
||
Main_process( args.b.name, args.g.name, args.r, OutputFileName) |