Skip to content

Commit

Permalink
enable handling gzip input in call_freq script
Browse files Browse the repository at this point in the history
  • Loading branch information
PengNi committed Mar 10, 2022
1 parent c8d5177 commit 1c73d4e
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 55 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ For the example data:
# 1. run multi_to_single_fast5 if needed
multi_to_single_fast5 -i $multi_read_fast5_dir -s $single_read_fast5_dir -t 30 --recursive

# 2. basecall using GPU, fast5s is the $single_read_fast5_dir
# 2. basecall using GPU, fast5s/ is the $single_read_fast5_dir
guppy_basecaller -i fast5s/ -r -s fast5s_guppy \
--config dna_r9.4.1_450bps_hac_prom.cfg \
--device CUDA:0
Expand Down Expand Up @@ -301,6 +301,9 @@ deepsignal_plant train --train_file /path/to/train/file \
--model_dir /dir/to/save/the/new/model
```

Extra
=====
We are testing deepsignal-plant on a zebrafish sample...

License
=========
Expand Down
2 changes: 1 addition & 1 deletion deepsignal_plant/call_modifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -695,7 +695,7 @@ def main():
required=True,
help="the input path, can be a signal_feature file from extract_features.py, "
"or a directory of fast5 files. If a directory of fast5 files is provided, "
"args in FAST5_EXTRACTION should (reference_path must) be provided.")
"args in FAST5_EXTRACTION should be provided.")
p_input.add_argument("--f5_batch_size", action="store", type=int, default=30,
required=False,
help="number of reads/files to be processed by each process one time, default 30")
Expand Down
96 changes: 66 additions & 30 deletions deepsignal_plant/call_mods_freq.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import argparse
import os
import sys
import gzip
import time

from .utils.txt_formater import ModRecord
Expand Down Expand Up @@ -38,27 +39,31 @@ def calculate_mods_frequency(mods_files, prob_cf, contig_name=None):

count, used = 0, 0
for mods_file in mods_files:
with open(mods_file, 'r') as rf:
for line in rf:
words = line.strip().split("\t")
mod_record = ModRecord(words)
if contig_name is not None and mod_record._chromosome != contig_name:
continue
if mod_record.is_record_callable(prob_cf):
if mod_record._site_key not in sitekeys:
sitekeys.add(mod_record._site_key)
sitekey2stats[mod_record._site_key] = SiteStats(mod_record._strand,
mod_record._pos_in_strand,
mod_record._kmer)
sitekey2stats[mod_record._site_key]._prob_0 += mod_record._prob_0
sitekey2stats[mod_record._site_key]._prob_1 += mod_record._prob_1
sitekey2stats[mod_record._site_key]._coverage += 1
if mod_record._called_label == 1:
sitekey2stats[mod_record._site_key]._met += 1
else:
sitekey2stats[mod_record._site_key]._unmet += 1
used += 1
count += 1
if mods_file.endswith(".gz"):
infile = gzip.open(mods_file, 'rt')
else:
infile = open(mods_file, 'r')
for line in infile:
words = line.strip().split("\t")
mod_record = ModRecord(words)
if contig_name is not None and mod_record._chromosome != contig_name:
continue
if mod_record.is_record_callable(prob_cf):
if mod_record._site_key not in sitekeys:
sitekeys.add(mod_record._site_key)
sitekey2stats[mod_record._site_key] = SiteStats(mod_record._strand,
mod_record._pos_in_strand,
mod_record._kmer)
sitekey2stats[mod_record._site_key]._prob_0 += mod_record._prob_0
sitekey2stats[mod_record._site_key]._prob_1 += mod_record._prob_1
sitekey2stats[mod_record._site_key]._coverage += 1
if mod_record._called_label == 1:
sitekey2stats[mod_record._site_key]._met += 1
else:
sitekey2stats[mod_record._site_key]._unmet += 1
used += 1
count += 1
infile.close()
if contig_name is None:
print("{:.2f}% ({} of {}) calls used..".format(used/float(count) * 100, used, count))
else:
Expand Down Expand Up @@ -111,6 +116,26 @@ def _read_file_lines(cfile):
return rf.read().splitlines()


def _get_contignams_from_genome_fasta(genomefa):
contigs = []
with open(genomefa, "r") as rf:
for line in rf:
if line.startswith(">"):
contigname = line.strip()[1:].split(' ')[0]
contigs.append(contigname)
return contigs


def _is_file_a_genome_fasta(contigfile):
with open(contigfile, "r") as rf:
for line in rf:
if line.startswith("#"):
continue
elif line.startswith(">"):
return True
return False


def _get_contigfile_name(wprefix, contig):
return wprefix + "." + contig + ".txt"

Expand All @@ -121,12 +146,16 @@ def _split_file_by_contignames(mods_files, wprefix, contigs):
for contig in contigs:
wfs[contig] = open(_get_contigfile_name(wprefix, contig), "w")
for input_file in mods_files:
with open(input_file, "r") as rf:
for line in rf:
chrom = line.strip().split("\t")[0]
if chrom not in contigs:
continue
wfs[chrom].write(line)
if input_file.endswith(".gz"):
infile = gzip.open(input_file, 'rt')
else:
infile = open(input_file, 'r')
for line in infile:
chrom = line.strip().split("\t")[0]
if chrom not in contigs:
continue
wfs[chrom].write(line)
infile.close()
for contig in contigs:
wfs[contig].flush()
wfs[contig].close()
Expand Down Expand Up @@ -199,7 +228,12 @@ def call_mods_frequency_to_file(args):
contigs = None
if args.contigs is not None:
if os.path.isfile(args.contigs):
contigs = sorted(list(set(_read_file_lines(args.contigs))))
if args.contigs.endswith(".fa") or args.contigs.endswith(".fasta") or args.contigs.endswith(".fna"):
contigs = _get_contignams_from_genome_fasta(args.contigs)
elif _is_file_a_genome_fasta(args.contigs):
contigs = _get_contignams_from_genome_fasta(args.contigs)
else:
contigs = sorted(list(set(_read_file_lines(args.contigs))))
else:
contigs = sorted(list(set(args.contigs.strip().split(","))))

Expand Down Expand Up @@ -259,8 +293,10 @@ def main():
help='the file path to save the result')

parser.add_argument('--contigs', action="store", type=str, required=False, default=None,
help="path of a file contains chromosome/contig names, one name each line; "
"or a string contains multiple chromosome names splited by comma. "
help="a reference genome file (.fa/.fasta/.fna), used for extracting all "
"contig names for parallel; "
"or path of a file containing chromosome/contig names, one name each line; "
"or a string contains multiple chromosome names splited by comma."
"default None, which means all chromosomes will be processed at one time. "
"If not None, one chromosome will be processed by one subprocess.")
parser.add_argument('--nproc', action="store", type=int, required=False, default=1,
Expand Down
8 changes: 5 additions & 3 deletions deepsignal_plant/deepsignal_plant.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def main():
required=True,
help="the input path, can be a signal_feature file from extract_features.py, "
"or a directory of fast5 files. If a directory of fast5 files is provided, "
"args in FAST5_EXTRACTION should (reference_path must) be provided.")
"args in FAST5_EXTRACTION should be provided.")
sc_input.add_argument("--f5_batch_size", action="store", type=int, default=30,
required=False,
help="number of reads/files to be processed by each process one time, default 30")
Expand Down Expand Up @@ -462,8 +462,10 @@ def main():

scf_para = sub_call_freq.add_argument_group("PARALLEL")
scf_para.add_argument('--contigs', action="store", type=str, required=False, default=None,
help="path of a file contains chromosome/contig names, one name each line; "
"or a string contains multiple chromosome names splited by comma. "
help="a reference genome file (.fa/.fasta/.fna), used for extracting all "
"contig names for parallel; "
"or path of a file containing chromosome/contig names, one name each line; "
"or a string contains multiple chromosome names splited by comma."
"default None, which means all chromosomes will be processed at one time. "
"If not None, one chromosome will be processed by one subprocess.")
scf_para.add_argument('--nproc', action="store", type=int, required=False, default=1,
Expand Down
2 changes: 1 addition & 1 deletion deepsignal_plant/extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,10 +518,10 @@ def _extract_preprocess_(motifs, is_dna, reference_path, position_file, regionst
print("parse the motifs string..")
motif_seqs = get_motif_seqs(motifs, is_dna)

print("read genome reference file..")
if reference_path is None:
chrom2len = None
else:
print("read genome reference file..")
chrom2len = get_contig2len(reference_path)

positions = None
Expand Down
43 changes: 24 additions & 19 deletions scripts/call_modification_frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import argparse
import os
import sys
import gzip

from txt_formater import ModRecord
from txt_formater import SiteStats
Expand All @@ -18,25 +19,29 @@ def calculate_mods_frequency(mods_files, prob_cf):

count, used = 0, 0
for mods_file in mods_files:
with open(mods_file, 'r') as rf:
for line in rf:
words = line.strip().split("\t")
mod_record = ModRecord(words)
if mod_record.is_record_callable(prob_cf):
if mod_record._site_key not in sitekeys:
sitekeys.add(mod_record._site_key)
sitekey2stats[mod_record._site_key] = SiteStats(mod_record._strand,
mod_record._pos_in_strand,
mod_record._kmer)
sitekey2stats[mod_record._site_key]._prob_0 += mod_record._prob_0
sitekey2stats[mod_record._site_key]._prob_1 += mod_record._prob_1
sitekey2stats[mod_record._site_key]._coverage += 1
if mod_record._called_label == 1:
sitekey2stats[mod_record._site_key]._met += 1
else:
sitekey2stats[mod_record._site_key]._unmet += 1
used += 1
count += 1
if mods_file.endswith(".gz"):
infile = gzip.open(mods_file, 'rt')
else:
infile = open(mods_file, 'r')
for line in infile:
words = line.strip().split("\t")
mod_record = ModRecord(words)
if mod_record.is_record_callable(prob_cf):
if mod_record._site_key not in sitekeys:
sitekeys.add(mod_record._site_key)
sitekey2stats[mod_record._site_key] = SiteStats(mod_record._strand,
mod_record._pos_in_strand,
mod_record._kmer)
sitekey2stats[mod_record._site_key]._prob_0 += mod_record._prob_0
sitekey2stats[mod_record._site_key]._prob_1 += mod_record._prob_1
sitekey2stats[mod_record._site_key]._coverage += 1
if mod_record._called_label == 1:
sitekey2stats[mod_record._site_key]._met += 1
else:
sitekey2stats[mod_record._site_key]._unmet += 1
used += 1
count += 1
infile.close()
print("{:.2f}% ({} of {}) calls used..".format(used/float(count) * 100, used, count))
return sitekey2stats

Expand Down

0 comments on commit 1c73d4e

Please sign in to comment.