Skip to content

Commit

Permalink
Merge pull request #242 from waveygang/specify-queries
Browse files Browse the repository at this point in the history
Specify queries
  • Loading branch information
ekg committed May 28, 2024
2 parents 251f4e1 + 76d19ca commit 517e1bc
Show file tree
Hide file tree
Showing 9 changed files with 248 additions and 69 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/test_on_push.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
on:
push:
branches: [ master ]
branches: [ main ]
paths-ignore:
- '**/*.md'
pull_request:
branches: [ master ]
branches: [ main ]
paths-ignore:
- '**/*.md'

Expand Down Expand Up @@ -38,9 +38,9 @@ jobs:
- name: Test mapping coverage with 8 yeast genomes (PAF output)
run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y '#' > scerevisiae8.paf; scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.92
- name: Test mapping+alignment with a subset of the LPA dataset (PAF output)
run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -T wflign_info. -u ./ -L --path-patching-tsv x.tsv > LPA.subset.paf && head LPA.subset.paf && head x.tsv
run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -n 10 -L > LPA.subset.paf && head LPA.subset.paf
- name: Test mapping+alignment with a subset of the LPA dataset (SAM output)
run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -T wflign_info. -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9
run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/LPA.subset.fa.gz -N -a -L > LPA.subset.sam && samtools view LPA.subset.sam -bS | samtools sort > LPA.subset.bam && samtools index LPA.subset.bam && samtools view LPA.subset.bam | head | cut -f 1-9
- name: Test mapping+alignment with short reads (500 bps) to a reference (SAM output)
run: ASAN_OPTIONS=detect_leaks=1:symbolize=1 LSAN_OPTIONS=verbosity=0:log_threads=1 build/bin/wfmash data/reference.fa.gz data/reads.500bps.fa.gz -s 0.5k -N -a > reads.500bps.sam && samtools view reads.500bps.sam -bS | samtools sort > reads.500bps.bam && samtools index reads.500bps.bam && samtools view reads.500bps.bam | head
- name: Test mapping+alignment with short reads (255bps) (PAF output)
Expand Down
90 changes: 90 additions & 0 deletions scripts/all2all_jobs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import argparse
import gzip
import itertools

def parse_fasta_index(fasta_file):
fai_file = fasta_file + '.fai'
sequences = []
with open(fai_file, 'r') as file:
for line in file:
sequence_name = line.strip().split('\t')[0]
sequences.append(sequence_name)
return sequences

def group_sequences(sequences, grouping):
grouped_sequences = {}
for sequence in sequences:
if '#' in sequence:
fields = sequence.split('#')
if grouping in ['g', 'genome']:
group_key = fields[0]
elif grouping in ['h', 'haplotype']:
group_key = '#'.join(fields[:2])
elif grouping in ['c', 'contig']:
group_key = sequence
else:
raise ValueError(f"Invalid grouping: {grouping}")
else:
group_key = sequence

if group_key not in grouped_sequences:
grouped_sequences[group_key] = []
grouped_sequences[group_key].append(sequence)

return grouped_sequences

def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries):
pairings = []
target_groups = list(target_grouped_sequences.keys())
query_groups = list(query_grouped_sequences.keys())

for target_group in target_groups:
query_pool = [group for group in query_groups if group != target_group]

for query_chunk in itertools.zip_longest(*[iter(query_pool)] * num_queries):
query_chunk = [q for q in query_chunk if q is not None]
pairings.append((target_group, query_chunk))

return pairings

def main():
parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.')
parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)')
parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)')
parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)')
parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)')
parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines')

args, wfmash_args = parser.parse_known_args()

# Parse the FASTA index file
sequences = parse_fasta_index(args.fasta_file)

# Group sequences based on the specified grouping levels
target_grouped_sequences = group_sequences(sequences, args.target_grouping)
query_grouped_sequences = group_sequences(sequences, args.query_grouping)

# Generate pairings
pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries)

# Save or print the pairings or command lines
if wfmash_args:
wfmash_options = ' '.join(wfmash_args)
if args.output:
with open(args.output, 'w') as file:
for target_group, query_groups in pairings:
file.write(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}\n")
else:
for target_group, query_groups in pairings:
print(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}")
else:
if args.output:
with open(args.output, 'w') as file:
for target_group, query_groups in pairings:
file.write(f"{target_group}\t{','.join(query_groups)}\n")
else:
for target_group, query_groups in pairings:
print(f"{target_group}\t{','.join(query_groups)}")

if __name__ == '__main__':
main()
20 changes: 20 additions & 0 deletions scripts/make_source_targball.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

# run from root of the repository

mkdir source-tarball
cd source-tarball
git clone --recursive https://github.com/waveygang/wfmash
cd wfmash
git fetch --tags origin
LATEST_TAG="$(git describe --tags `git rev-list --tags --max-count=1`)"
git checkout "${LATEST_TAG}"
git submodule update --init --recursive
bash scripts/generate_git_version.sh $PWD/src
sed 's/execute_process(COMMAND bash/#execute_process(COMMAND bash/g' CMakeLists.txt -i
rm -Rf .git
find src/common -name ".git" -exec rm -Rf "{}" \;
cd ..
mv wfmash "wfmash-${LATEST_TAG}"
tar -czf "wfmash-${LATEST_TAG}.tar.gz" "wfmash-${LATEST_TAG}"
rm -Rf "wfmash-${LATEST_TAG}"
53 changes: 53 additions & 0 deletions src/common/seqiter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,57 @@ void for_each_seq_in_file(
}
}

void for_each_seq_in_file(
faidx_t* fai,
const std::vector<std::string>& seq_names,
const std::function<void(const std::string&, const std::string&)>& func) {
for (const auto& seq_name : seq_names) {
int len;
char* seq = fai_fetch(fai, seq_name.c_str(), &len);
if (seq != nullptr) {
func(seq_name, std::string(seq));
free(seq);
}
}
}

void for_each_seq_in_file_filtered(
const std::string& filename,
const std::vector<std::string>& query_prefix,
const std::unordered_set<std::string>& query_list,
const std::function<void(const std::string&, const std::string&)>& func) {
faidx_t* fai = fai_load(filename.c_str());
if (fai == nullptr) {
std::cerr << "Error: Failed to load FASTA index for file " << filename << std::endl;
return;
}

std::vector<std::string> query_seq_names;
int num_seqs = faidx_nseq(fai);
for (int i = 0; i < num_seqs; i++) {
const char* seq_name = faidx_iseq(fai, i);
bool prefix_skip = true;
for (const auto& prefix : query_prefix) {
if (strncmp(seq_name, prefix.c_str(), prefix.size()) == 0) {
prefix_skip = false;
break;
}
}
if (!query_prefix.empty() && prefix_skip) {
continue;
}
if (!query_list.empty() && query_list.count(seq_name) == 0) {
continue;
}
query_seq_names.push_back(seq_name);
}

for_each_seq_in_file(
fai,
query_seq_names,
func);

fai_destroy(fai);
}

} // namespace seqiter
11 changes: 4 additions & 7 deletions src/common/wflign/src/wflign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,7 @@ void WFlign::wflign_affine_wavefront(

// Free
delete wflambda_aligner;
delete wf_aligner;

#ifdef WFA_PNG_TSV_TIMING
if (extend_data.emit_png) {
Expand Down Expand Up @@ -972,9 +973,6 @@ void WFlign::wflign_affine_wavefront(
}

if (merge_alignments) {
// Free old aligner
delete wf_aligner;

// use biWFA for all patching
wfa::WFAlignerGapAffine2Pieces* wf_aligner =
new wfa::WFAlignerGapAffine2Pieces(
Expand Down Expand Up @@ -1029,7 +1027,9 @@ void WFlign::wflign_affine_wavefront(
emit_patching_tsv,
out_patching_tsv
#endif
);
);

delete wf_aligner;
} else {
// todo old implementation (and SAM format is not supported)
for (auto x = trace.rbegin(); x != trace.rend(); ++x) {
Expand All @@ -1050,9 +1050,6 @@ void WFlign::wflign_affine_wavefront(
}
}
}

// Free
delete wf_aligner;
}
}

Expand Down
40 changes: 23 additions & 17 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ void parse_args(int argc,
args::Group mandatory_opts(parser, "[ MANDATORY OPTIONS ]");
args::Positional<std::string> target_sequence_file(mandatory_opts, "target", "alignment target/reference sequence file");

args::Group io_opts(parser, "[ Files IO Options ]");
args::PositionalList<std::string> query_sequence_files(io_opts, "queries", "query sequences file");
args::ValueFlag<std::string> query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"});
args::Group io_opts(parser, "[ Files IO Options ]");
args::PositionalList<std::string> query_sequence_files(io_opts, "queries", "query sequence file(s)");
//args::ValueFlag<std::string> query_sequence_file_list(io_opts, "queries", "alignment queries files list", {'Q', "query-file-list"});

args::Group mapping_opts(parser, "[ Mapping Options ]");
args::ValueFlag<float> map_pct_identity(mapping_opts, "%", "percent identity in the mashmap step [default: 90]", {'p', "map-pct-id"});
Expand All @@ -80,8 +80,10 @@ void parse_args(int argc,
args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
args::ValueFlag<char> skip_prefix(mapping_opts, "C", "skip mappings when the query and target have the same prefix before the last occurrence of the given character C", {'Y', "skip-prefix"});
args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose name starts with this prefix", {'P', "target-prefix"});
args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'A', "target-list"});
args::ValueFlag<std::string> target_prefix(mapping_opts, "pfx", "use only targets whose names start with this prefix", {'T', "target-prefix"});
args::ValueFlag<std::string> target_list(mapping_opts, "FILE", "file containing list of target sequence names to use", {'R', "target-list"});
args::ValueFlag<std::string> query_prefix(mapping_opts, "pfx[,pfx,...]", "use only queries whose names start with these prefixes (comma delimited)", {'Q', "query-prefix"});
args::ValueFlag<std::string> query_list(mapping_opts, "FILE", "file containing list of query sequence names", {'A', "query-list"});
args::Flag approx_mapping(mapping_opts, "approx-map", "skip base-level alignment, producing an approximate mapping in PAF", {'m',"approx-map"});
args::Flag no_split(mapping_opts, "no-split", "disable splitting of input sequences during mapping [default: enabled]", {'N',"no-split"});
args::ValueFlag<std::string> chain_gap(mapping_opts, "N", "chain mappings closer than this distance in query and target, sets approximate maximum variant length detectable in alignment [default: 4*segment_length, up to 20k]", {'c', "chain-gap"});
Expand All @@ -102,19 +104,19 @@ void parse_args(int argc,
args::Group alignment_opts(parser, "[ Alignment Options ]");
args::ValueFlag<std::string> align_input_paf(alignment_opts, "FILE", "derive precise alignments for this input PAF", {'i', "input-paf"});
args::Flag invert_filtering(alignment_opts, "A", "if an input PAF is specified, remove alignments with gap-compressed identity below --map-pct-id x 0.8, else keep all alignments "
"[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]",
"[default: if an input PAF is specified, keep all alignments, else remove alignments with gap-compressed identity below --map-pct-id x 0.8]",
{'O', "invert-filtering"});
args::ValueFlag<uint16_t> wflambda_segment_length(alignment_opts, "N", "wflambda segment length: size (in bp) of segment mapped in hierarchical WFA problem [default: 256]", {'W', "wflamda-segment"});
args::ValueFlag<std::string> wfa_score_params(alignment_opts, "mismatch,gap1,ext1",
"score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]",
{"wfa-params"});
"score parameters for the wfa alignment (affine); match score is fixed at 0 [default: 6,8,1]",
{"wfa-params"});
args::ValueFlag<std::string> wfa_patching_score_params(alignment_opts, "mismatch,gap1,ext1,gap2,ext2",
"score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]",
{"wfa-patching-params"});
"score parameters for the wfa patching alignment (convex); match score is fixed at 0 [default: 5,8,2,49,1]",
{"wfa-patching-params"});
//wflign parameters
args::ValueFlag<std::string> wflign_score_params(alignment_opts, "mismatch,gap1,ext1",
"score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",
{"wflign-params"});
"score parameters for the wflign alignment (affine); match score is fixed at 0 [default: 4,6,1]",
{"wflign-params"});
args::ValueFlag<float> wflign_max_mash_dist(alignment_opts, "N", "maximum mash distance to perform the alignment in a wflambda segment [default: adaptive with respect to the estimated identity]", {'b', "max-mash-dist"});
args::ValueFlag<int> wflign_min_wavefront_length(alignment_opts, "N", "min wavefront length for heuristic WFlign [default: 1024]", {'j', "wflign-min-wf-len"});
args::ValueFlag<int> wflign_max_distance_threshold(alignment_opts, "N", "max distance threshold for heuristic WFlign [default: 2048/(estimated_identity^2)]", {'q', "wflign-max-distance"});
Expand All @@ -139,7 +141,7 @@ void parse_args(int argc,

#ifdef WFA_PNG_TSV_TIMING
args::Group debugging_opts(parser, "[ Debugging Options ]");
args::ValueFlag<std::string> prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'T', "tsv"});
args::ValueFlag<std::string> prefix_wavefront_info_in_tsv(parser, "PREFIX", " write wavefronts' information for each alignment in TSV format files with this PREFIX", {'G', "tsv"});
args::ValueFlag<std::string> prefix_wavefront_plot_in_png(parser, "PREFIX", " write wavefronts' plot for each alignment in PNG format files with this PREFIX", {'u', "prefix-png"});
args::ValueFlag<uint64_t> wfplot_max_size(parser, "N", "max size of the wfplot [default: 1500]", {'z', "wfplot-max-size"});
args::ValueFlag<std::string> path_patching_info_in_tsv(parser, "FILE", " write patching information for each alignment in TSV format in FILE", {"path-patching-tsv"});
Expand Down Expand Up @@ -202,6 +204,14 @@ void parse_args(int argc,
if (target_prefix) {
map_parameters.target_prefix = args::get(target_prefix);
}

if (query_list) {
map_parameters.query_list = args::get(query_list);
}

if (query_prefix) {
map_parameters.query_prefix = skch::CommonFunc::split(args::get(query_prefix), ',');
}

if (target_sequence_file) {
map_parameters.refSequences.push_back(args::get(target_sequence_file));
Expand All @@ -215,10 +225,6 @@ void parse_args(int argc,
align_parameters.querySequences.push_back(q);
}
}
if (query_sequence_file_list) {
skch::parseFileList(args::get(query_sequence_file_list), map_parameters.querySequences);
skch::parseFileList(args::get(query_sequence_file_list), align_parameters.querySequences);
}

if (target_sequence_file && map_parameters.querySequences.empty()
&& map_parameters.refSequences.size() == 1
Expand Down
Loading

0 comments on commit 517e1bc

Please sign in to comment.