Merge pull request #242 from waveygang/specify-queries

Specify queries
waveygang · May 28, 2024 · 517e1bc · 517e1bc
2 parents 251f4e1 + 76d19ca
commit 517e1bc
Show file tree

Hide file tree

Showing 9 changed files with 248 additions and 69 deletions.
diff --git a/.github/workflows/test_on_push.yml b/.github/workflows/test_on_push.yml
@@ -1,10 +1,10 @@
 on:
  push:
- branches: [ master ]
+ branches: [ main ]
  paths-ignore:
  - '**/*.md'
  pull_request:
- branches: [ master ]
+ branches: [ main ]
  paths-ignore:
  - '**/*.md'
 
@@ -38,9 +38,9 @@ jobs:
  - name: Test mapping coverage with 8 yeast genomes (PAF output)
  run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92
  - name: Test mapping+alignment with a subset of the LPA dataset (PAF output)
- run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -T wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv
+ run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -L > LPA.subset.paf && head LPA.subset.paf
  - name: Test mapping+alignment with a subset of the LPA dataset (SAM output)
- run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -T wflign_info. -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9
+ run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9
  - name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output)
  run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head
  - name: Test mapping+alignment with short reads (255bps) (PAF output)

diff --git a/scripts/all2all_jobs.py b/scripts/all2all_jobs.py
@@ -0,0 +1,90 @@
+import argparse
+import gzip
+import itertools
+
+def parse_fasta_index(fasta_file):
+ fai_file = fasta_file + '.fai'
+ sequences = []
+ with open(fai_file, 'r') as file:
+ for line in file:
+ sequence_name = line.strip().split('\t')[0]
+ sequences.append(sequence_name)
+ return sequences
+
+def group_sequences(sequences, grouping):
+ grouped_sequences = {}
+ for sequence in sequences:
+ if '#' in sequence:
+ fields = sequence.split('#')
+ if grouping in ['g', 'genome']:
+ group_key = fields[0]
+ elif grouping in ['h', 'haplotype']:
+ group_key = '#'.join(fields[:2])
+ elif grouping in ['c', 'contig']:
+ group_key = sequence
+ else:
+ raise ValueError(f"Invalid grouping: {grouping}")
+ else:
+ group_key = sequence
+
+ if group_key not in grouped_sequences:
+ grouped_sequences[group_key] = []
+ grouped_sequences[group_key].append(sequence)
+
+ return grouped_sequences
+
+def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries):
+ pairings = []
+ target_groups = list(target_grouped_sequences.keys())
+ query_groups = list(query_grouped_sequences.keys())
+
+ for target_group in target_groups:
+ query_pool = [group for group in query_groups if group != target_group]
+
+ for query_chunk in itertools.zip_longest(*[iter(query_pool)] * num_queries):
+ query_chunk = [q for q in query_chunk if q is not None]
+ pairings.append((target_group, query_chunk))
+
+ return pairings
+
+def main():
+ parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.')
+ parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)')
+ parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)')
+ parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)')
+ parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)')
+ parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines')
+
+ args, wfmash_args = parser.parse_known_args()
+
+ # Parse the FASTA index file
+ sequences = parse_fasta_index(args.fasta_file)
+
+ # Group sequences based on the specified grouping levels
+ target_grouped_sequences = group_sequences(sequences, args.target_grouping)
+ query_grouped_sequences = group_sequences(sequences, args.query_grouping)
+
+ # Generate pairings
+ pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries)
+
+ # Save or print the pairings or command lines
+ if wfmash_args:
+ wfmash_options = ' '.join(wfmash_args)
+ if args.output:
+ with open(args.output, 'w') as file:
+ for target_group, query_groups in pairings:
+ file.write(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}\n")
+ else:
+ for target_group, query_groups in pairings:
+ print(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}")
+ else:
+ if args.output:
+ with open(args.output, 'w') as file:
+ for target_group, query_groups in pairings:
+ file.write(f"{target_group}\t{','.join(query_groups)}\n")
+ else:
+ for target_group, query_groups in pairings:
+ print(f"{target_group}\t{','.join(query_groups)}")
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/make_source_targball.sh b/scripts/make_source_targball.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# run from root of the repository
+
+mkdir source-tarball
+cd source-tarball
+git clone --recursive https://github.com/waveygang/wfmash
+cd wfmash
+git fetch --tags origin
+LATEST_TAG="$(git describe --tags `git rev-list --tags --max-count=1`)"
+git checkout "${LATEST_TAG}"
+git submodule update --init --recursive
+bash scripts/generate_git_version.sh $PWD/src
+sed 's/execute_process(COMMAND bash/#execute_process(COMMAND bash/g' CMakeLists.txt -i
+rm -Rf .git
+find src/common -name ".git" -exec rm -Rf "{}" \;
+cd ..
+mv wfmash "wfmash-${LATEST_TAG}"
+tar -czf "wfmash-${LATEST_TAG}.tar.gz" "wfmash-${LATEST_TAG}"
+rm -Rf "wfmash-${LATEST_TAG}"
diff --git a/src/common/seqiter.hpp b/src/common/seqiter.hpp
@@ -106,4 +106,57 @@ void for_each_seq_in_file(
  }
 }
 
+void for_each_seq_in_file(
+ faidx_t* fai,
+ const std::vector<std::string>& seq_names,
+ const std::function<void(const std::string&, const std::string&)>& func) {
+ for (const auto& seq_name : seq_names) {
+ int len;
+ char* seq = fai_fetch(fai, seq_name.c_str(), &len);
+ if (seq != nullptr) {
+ func(seq_name, std::string(seq));
+ free(seq);
+ }
+ }
+}
+
+void for_each_seq_in_file_filtered(
+ const std::string& filename,
+ const std::vector<std::string>& query_prefix,
+ const std::unordered_set<std::string>& query_list,
+ const std::function<void(const std::string&, const std::string&)>& func) {
+ faidx_t* fai = fai_load(filename.c_str());
+ if (fai == nullptr) {
+ std::cerr << "Error: Failed to load FASTA index for file " << filename << std::endl;
+ return;
+ }
+
+ std::vector<std::string> query_seq_names;
+ int num_seqs = faidx_nseq(fai);
+ for (int i = 0; i < num_seqs; i++) {
+ const char* seq_name = faidx_iseq(fai, i);
+ bool prefix_skip = true;
+ for (const auto& prefix : query_prefix) {
+ if (strncmp(seq_name, prefix.c_str(), prefix.size()) == 0) {
+ prefix_skip = false;
+ break;
+ }
+ }
+ if (!query_prefix.empty() && prefix_skip) {
+ continue;
+ }
+ if (!query_list.empty() && query_list.count(seq_name) == 0) {
+ continue;
+ }
+ query_seq_names.push_back(seq_name);
+ }
+
+ for_each_seq_in_file(
+ fai,
+ query_seq_names,
+ func);
+
+ fai_destroy(fai);
+}
+
 } // namespace seqiter
diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp
@@ -695,6 +695,7 @@ void WFlign::wflign_affine_wavefront(
 
  // Free
  delete wflambda_aligner;
+ delete wf_aligner;
 
 #ifdef WFA_PNG_TSV_TIMING
  if (extend_data.emit_png) {
@@ -972,9 +973,6 @@ void WFlign::wflign_affine_wavefront(
  }
 
  if (merge_alignments) {
- // Free old aligner
- delete wf_aligner;
-
  // use biWFA for all patching
  wfa::WFAlignerGapAffine2Pieces* wf_aligner =
  new wfa::WFAlignerGapAffine2Pieces(
@@ -1029,7 +1027,9 @@ void WFlign::wflign_affine_wavefront(
  emit_patching_tsv,
  out_patching_tsv
 #endif
- );
+ );
+
+ delete wf_aligner;
  } else {
  // todo old implementation (and SAM format is not supported)
  for (auto x = trace.rbegin(); x != trace.rend(); ++x) {
@@ -1050,9 +1050,6 @@ void WFlign::wflign_affine_wavefront(
  }
  }
  }
-
- // Free
- delete wf_aligner;
  }
 }
 

diff --git a/src/interface/parse_args.hpp b/src/interface/parse_args.hpp
@@ -64,9 +64,9 @@ void parse_args(int argc,
  args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]");
  args::Positional<std::string> target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file");
 
- args::Group io_opts(parser, "[ Files IO Options ]");
- args::PositionalList<std::string> query_sequence_files(io_opts, "queries", "query sequences file");
- args::ValueFlag<std::string> query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"});
+ args::Group io_opts(parser, "[ Files IO Options ]");
+ args::PositionalList<std::string> query_sequence_files(io_opts, "queries", "query sequence file(s)");
+ //args::ValueFlag<std::string> query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"});
 
  args::Group mapping_opts(parser, "[ Mapping Options ]");
  args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"});
@@ -80,8 +80,10 @@ void parse_args(int argc,
  args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
  args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
  args::ValueFlag<char> skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"});
- args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose name starts with this prefix", {'P', "target-prefix"});
- args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'A', "target-list"});
+ args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"});
+ args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"});
+ args::ValueFlag<std::string> query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"});
+ args::ValueFlag<std::string> query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"});
  args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"});
  args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"});
  args::ValueFlag<std::string> chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"});
@@ -102,19 +104,19 @@ void parse_args(int argc,
  args::Group alignment_opts(parser, "[ Alignment Options ]");
  args::ValueFlag<std::string> align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"});
  args::Flag invert_filtering(alignment_opts, "A", "if an input PAF is specified, remove alignments with gap-compressed identity below --map-pct-id x 0.8, else keep all alignments "
- "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]",
+ "[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]",
  {'O', "invert-filtering"});
  args::ValueFlag<uint16_t> wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"});
  args::ValueFlag<std::string> wfa_score_params(alignment_opts, "mismatch,gap1,ext1",
-  "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]",
-  {"wfa-params"});
+  "score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]",
+  {"wfa-params"});
  args::ValueFlag<std::string> wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2",
-  "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]",
-  {"wfa-patching-params"});
+  "score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]",
+  {"wfa-patching-params"});
  //wflign parameters
  args::ValueFlag<std::string> wflign_score_params(alignment_opts, "mismatch,gap1,ext1",
-  "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",
-  {"wflign-params"});
+  "score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",
+  {"wflign-params"});
  args::ValueFlag<float> wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"});
  args::ValueFlag<int> wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"});
  args::ValueFlag<int> wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"});
@@ -139,7 +141,7 @@ void parse_args(int argc,
 
 #ifdef WFA_PNG_TSV_TIMING
  args::Group debugging_opts(parser, "[ Debugging Options ]");
- args::ValueFlag<std::string> prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'T', "tsv"});
+ args::ValueFlag<std::string> prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'G', "tsv"});
  args::ValueFlag<std::string> prefix_wavefront_plot_in_png(parser, "PREFIX", " write wavefronts' plot for each alignment in PNG format files with this PREFIX", {'u', "prefix-png"});
  args::ValueFlag<uint64_t> wfplot_max_size(parser, "N", "max size of the wfplot [default: 1500]", {'z', "wfplot-max-size"});
  args::ValueFlag<std::string> path_patching_info_in_tsv(parser, "FILE", " write patching information for each alignment in TSV format in FILE", {"path-patching-tsv"});
@@ -202,6 +204,14 @@ void parse_args(int argc,
  if (target_prefix) {
  map_parameters.target_prefix = args::get(target_prefix);
  }
+
+ if (query_list) {
+ map_parameters.query_list = args::get(query_list);
+ }
+
+ if (query_prefix) {
+ map_parameters.query_prefix = skch::CommonFunc::split(args::get(query_prefix), ',');
+ }
 
  if (target_sequence_file) {
  map_parameters.refSequences.push_back(args::get(target_sequence_file));
@@ -215,10 +225,6 @@ void parse_args(int argc,
  align_parameters.querySequences.push_back(q);
  }
  }
- if (query_sequence_file_list) {
- skch::parseFileList(args::get(query_sequence_file_list), map_parameters.querySequences);
- skch::parseFileList(args::get(query_sequence_file_list), align_parameters.querySequences);
- }
 
  if (target_sequence_file && map_parameters.querySequences.empty()
  && map_parameters.refSequences.size() == 1