-
Notifications
You must be signed in to change notification settings - Fork 4
/
sim_group_files.py
executable file
·63 lines (44 loc) · 1.77 KB
/
sim_group_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Xiaofei Zeng
# Email: [email protected]
# Created Time: 2022-07-24 17:05
import argparse
import collections
def parse_fasta(fasta_file):
group_ctg_dict = collections.defaultdict(list)
fa_dict = collections.defaultdict(list)
with open(fasta_file) as f:
for line in f:
if not line.strip():
continue
if line.startswith('>'):
ctg = line.split()[0][1:]
group = ctg.split('_')[0]
assert ctg not in group_ctg_dict[group]
group_ctg_dict[group].append(ctg)
else:
fa_dict[ctg].append(line.strip().upper())
for ctg, seq_list in fa_dict.items():
seq = ''.join(seq_list)
# Using fixed 'GATC' here is ok. Becuase in the sorting step,
# we don't acre about the restriction sites
RE_sites = seq.count('GATC')
fa_dict[ctg] = [seq, len(seq), RE_sites]
return fa_dict, group_ctg_dict
def generate_group_files(fa_dict, group_ctg_dict):
# sort by length
for group, ctgs in group_ctg_dict.items():
with open('group_{}.txt'.format(group), 'w') as fout:
fout.write('#Contig\tRECounts\tLength\n')
sorted_ctgs = sorted(ctgs, key=lambda x: fa_dict[x][1], reverse=True)
for ctg in sorted_ctgs:
fout.write('{}\t{}\t{}\n'.format(ctg, fa_dict[ctg][2], fa_dict[ctg][1]))
def main():
parser = argparse.ArgumentParser()
parser.add_argument('fasta', help='contig-level genome file in FASTA format')
args = parser.parse_args()
fa_dict, group_ctg_dict = parse_fasta(args.fasta)
generate_group_files(fa_dict, group_ctg_dict)
if __name__ == '__main__':
main()