enable handling gzip input in call_freq script

PengNi · Mar 10, 2022 · 1c73d4e · 1c73d4e
1 parent c8d5177
commit 1c73d4e
Show file tree

Hide file tree

Showing 6 changed files with 101 additions and 55 deletions.
diff --git a/README.md b/README.md
@@ -164,7 +164,7 @@ For the example data:
 # 1. run multi_to_single_fast5 if needed
 multi_to_single_fast5 -i $multi_read_fast5_dir -s $single_read_fast5_dir -t 30 --recursive
 
-# 2. basecall using GPU, fast5s is the $single_read_fast5_dir
+# 2. basecall using GPU, fast5s/ is the $single_read_fast5_dir
 guppy_basecaller -i fast5s/ -r -s fast5s_guppy \
  --config dna_r9.4.1_450bps_hac_prom.cfg \
  --device CUDA:0
@@ -301,6 +301,9 @@ deepsignal_plant train --train_file /path/to/train/file \
  --model_dir /dir/to/save/the/new/model
 ```
 
+Extra
+=====
+We are testing deepsignal-plant on a zebrafish sample...
 
 License
 =========

diff --git a/deepsignal_plant/call_modifications.py b/deepsignal_plant/call_modifications.py
@@ -695,7 +695,7 @@ def main():
  required=True,
  help="the input path, can be a signal_feature file from extract_features.py, "
  "or a directory of fast5 files. If a directory of fast5 files is provided, "
- "args in FAST5_EXTRACTION should (reference_path must) be provided.")
+ "args in FAST5_EXTRACTION should be provided.")
  p_input.add_argument("--f5_batch_size", action="store", type=int, default=30,
  required=False,
  help="number of reads/files to be processed by each process one time, default 30")

diff --git a/deepsignal_plant/call_mods_freq.py b/deepsignal_plant/call_mods_freq.py
@@ -8,6 +8,7 @@
 import argparse
 import os
 import sys
+import gzip
 import time
 
 from .utils.txt_formater import ModRecord
@@ -38,27 +39,31 @@ def calculate_mods_frequency(mods_files, prob_cf, contig_name=None):
 
  count, used = 0, 0
  for mods_file in mods_files:
- with open(mods_file, 'r') as rf:
- for line in rf:
- words = line.strip().split("\t")
- mod_record = ModRecord(words)
- if contig_name is not None and mod_record._chromosome != contig_name:
- continue
- if mod_record.is_record_callable(prob_cf):
- if mod_record._site_key not in sitekeys:
- sitekeys.add(mod_record._site_key)
- sitekey2stats[mod_record._site_key] = SiteStats(mod_record._strand,
- mod_record._pos_in_strand,
- mod_record._kmer)
- sitekey2stats[mod_record._site_key]._prob_0 += mod_record._prob_0
- sitekey2stats[mod_record._site_key]._prob_1 += mod_record._prob_1
- sitekey2stats[mod_record._site_key]._coverage += 1
- if mod_record._called_label == 1:
- sitekey2stats[mod_record._site_key]._met += 1
- else:
- sitekey2stats[mod_record._site_key]._unmet += 1
- used += 1
- count += 1
+ if mods_file.endswith(".gz"):
+ infile = gzip.open(mods_file, 'rt')
+ else:
+ infile = open(mods_file, 'r')
+ for line in infile:
+ words = line.strip().split("\t")
+ mod_record = ModRecord(words)
+ if contig_name is not None and mod_record._chromosome != contig_name:
+ continue
+ if mod_record.is_record_callable(prob_cf):
+ if mod_record._site_key not in sitekeys:
+ sitekeys.add(mod_record._site_key)
+ sitekey2stats[mod_record._site_key] = SiteStats(mod_record._strand,
+ mod_record._pos_in_strand,
+ mod_record._kmer)
+ sitekey2stats[mod_record._site_key]._prob_0 += mod_record._prob_0
+ sitekey2stats[mod_record._site_key]._prob_1 += mod_record._prob_1
+ sitekey2stats[mod_record._site_key]._coverage += 1
+ if mod_record._called_label == 1:
+ sitekey2stats[mod_record._site_key]._met += 1
+ else:
+ sitekey2stats[mod_record._site_key]._unmet += 1
+ used += 1
+ count += 1
+ infile.close()
  if contig_name is None:
  print("{:.2f}% ({} of {}) calls used..".format(used/float(count) * 100, used, count))
  else:
@@ -111,6 +116,26 @@ def _read_file_lines(cfile):
  return rf.read().splitlines()
 
 
+def _get_contignams_from_genome_fasta(genomefa):
+ contigs = []
+ with open(genomefa, "r") as rf:
+ for line in rf:
+ if line.startswith(">"):
+ contigname = line.strip()[1:].split(' ')[0]
+ contigs.append(contigname)
+ return contigs
+
+
+def _is_file_a_genome_fasta(contigfile):
+ with open(contigfile, "r") as rf:
+ for line in rf:
+ if line.startswith("#"):
+ continue
+ elif line.startswith(">"):
+ return True
+ return False
+
+
 def _get_contigfile_name(wprefix, contig):
  return wprefix + "." + contig + ".txt"
 
@@ -121,12 +146,16 @@ def _split_file_by_contignames(mods_files, wprefix, contigs):
  for contig in contigs:
  wfs[contig] = open(_get_contigfile_name(wprefix, contig), "w")
  for input_file in mods_files:
- with open(input_file, "r") as rf:
- for line in rf:
- chrom = line.strip().split("\t")[0]
- if chrom not in contigs:
- continue
- wfs[chrom].write(line)
+ if input_file.endswith(".gz"):
+ infile = gzip.open(input_file, 'rt')
+ else:
+ infile = open(input_file, 'r')
+ for line in infile:
+ chrom = line.strip().split("\t")[0]
+ if chrom not in contigs:
+ continue
+ wfs[chrom].write(line)
+ infile.close()
  for contig in contigs:
  wfs[contig].flush()
  wfs[contig].close()
@@ -199,7 +228,12 @@ def call_mods_frequency_to_file(args):
  contigs = None
  if args.contigs is not None:
  if os.path.isfile(args.contigs):
- contigs = sorted(list(set(_read_file_lines(args.contigs))))
+ if args.contigs.endswith(".fa") or args.contigs.endswith(".fasta") or args.contigs.endswith(".fna"):
+ contigs = _get_contignams_from_genome_fasta(args.contigs)
+ elif _is_file_a_genome_fasta(args.contigs):
+ contigs = _get_contignams_from_genome_fasta(args.contigs)
+ else:
+ contigs = sorted(list(set(_read_file_lines(args.contigs))))
  else:
  contigs = sorted(list(set(args.contigs.strip().split(","))))
 
@@ -259,8 +293,10 @@ def main():
  help='the file path to save the result')
 
  parser.add_argument('--contigs', action="store", type=str, required=False, default=None,
- help="path of a file contains chromosome/contig names, one name each line; "
- "or a string contains multiple chromosome names splited by comma. "
+ help="a reference genome file (.fa/.fasta/.fna), used for extracting all "
+ "contig names for parallel; "
+ "or path of a file containing chromosome/contig names, one name each line; "
+ "or a string contains multiple chromosome names splited by comma."
  "default None, which means all chromosomes will be processed at one time. "
  "If not None, one chromosome will be processed by one subprocess.")
  parser.add_argument('--nproc', action="store", type=int, required=False, default=1,

diff --git a/deepsignal_plant/deepsignal_plant.py b/deepsignal_plant/deepsignal_plant.py
@@ -211,7 +211,7 @@ def main():
  required=True,
  help="the input path, can be a signal_feature file from extract_features.py, "
  "or a directory of fast5 files. If a directory of fast5 files is provided, "
- "args in FAST5_EXTRACTION should (reference_path must) be provided.")
+ "args in FAST5_EXTRACTION should be provided.")
  sc_input.add_argument("--f5_batch_size", action="store", type=int, default=30,
  required=False,
  help="number of reads/files to be processed by each process one time, default 30")
@@ -462,8 +462,10 @@ def main():
 
  scf_para = sub_call_freq.add_argument_group("PARALLEL")
  scf_para.add_argument('--contigs', action="store", type=str, required=False, default=None,
- help="path of a file contains chromosome/contig names, one name each line; "
- "or a string contains multiple chromosome names splited by comma. "
+ help="a reference genome file (.fa/.fasta/.fna), used for extracting all "
+ "contig names for parallel; "
+ "or path of a file containing chromosome/contig names, one name each line; "
+ "or a string contains multiple chromosome names splited by comma."
  "default None, which means all chromosomes will be processed at one time. "
  "If not None, one chromosome will be processed by one subprocess.")
  scf_para.add_argument('--nproc', action="store", type=int, required=False, default=1,

diff --git a/deepsignal_plant/extract_features.py b/deepsignal_plant/extract_features.py
@@ -518,10 +518,10 @@ def _extract_preprocess_(motifs, is_dna, reference_path, position_file, regionst
  print("parse the motifs string..")
  motif_seqs = get_motif_seqs(motifs, is_dna)
 
- print("read genome reference file..")
  if reference_path is None:
  chrom2len = None
  else:
+ print("read genome reference file..")
  chrom2len = get_contig2len(reference_path)
 
  positions = None

diff --git a/scripts/call_modification_frequency.py b/scripts/call_modification_frequency.py
@@ -6,6 +6,7 @@
 import argparse
 import os
 import sys
+import gzip
 
 from txt_formater import ModRecord
 from txt_formater import SiteStats
@@ -18,25 +19,29 @@ def calculate_mods_frequency(mods_files, prob_cf):
 
  count, used = 0, 0
  for mods_file in mods_files:
- with open(mods_file, 'r') as rf:
- for line in rf:
- words = line.strip().split("\t")
- mod_record = ModRecord(words)
- if mod_record.is_record_callable(prob_cf):
- if mod_record._site_key not in sitekeys:
- sitekeys.add(mod_record._site_key)
- sitekey2stats[mod_record._site_key] = SiteStats(mod_record._strand,
- mod_record._pos_in_strand,
- mod_record._kmer)
- sitekey2stats[mod_record._site_key]._prob_0 += mod_record._prob_0
- sitekey2stats[mod_record._site_key]._prob_1 += mod_record._prob_1
- sitekey2stats[mod_record._site_key]._coverage += 1
- if mod_record._called_label == 1:
- sitekey2stats[mod_record._site_key]._met += 1
- else:
- sitekey2stats[mod_record._site_key]._unmet += 1
- used += 1
- count += 1
+ if mods_file.endswith(".gz"):
+ infile = gzip.open(mods_file, 'rt')
+ else:
+ infile = open(mods_file, 'r')
+ for line in infile:
+ words = line.strip().split("\t")
+ mod_record = ModRecord(words)
+ if mod_record.is_record_callable(prob_cf):
+ if mod_record._site_key not in sitekeys:
+ sitekeys.add(mod_record._site_key)
+ sitekey2stats[mod_record._site_key] = SiteStats(mod_record._strand,
+ mod_record._pos_in_strand,
+ mod_record._kmer)
+ sitekey2stats[mod_record._site_key]._prob_0 += mod_record._prob_0
+ sitekey2stats[mod_record._site_key]._prob_1 += mod_record._prob_1
+ sitekey2stats[mod_record._site_key]._coverage += 1
+ if mod_record._called_label == 1:
+ sitekey2stats[mod_record._site_key]._met += 1
+ else:
+ sitekey2stats[mod_record._site_key]._unmet += 1
+ used += 1
+ count += 1
+ infile.close()
  print("{:.2f}% ({} of {}) calls used..".format(used/float(count) * 100, used, count))
  return sitekey2stats