Skip to content

Commit

Permalink
Fully switching to weight-based mapping decision. Some fixes in scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
canfirtina committed Aug 28, 2023
1 parent 95217a3 commit a09971c
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 52 deletions.
10 changes: 5 additions & 5 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
#pass DEBUG=1 to make to enable debugging
ifdef DEBUG
CFLAGS=-Wall -O2 -fsanitize=address -g -Wc++-compat -DHAVE_KALLOC
CPPFLAGS=--std=c++11 -Wall -O2 -fsanitize=address -g -fopenmp -march=native -DHAVE_KALLOC
CFLAGS=-Wall -O2 -fsanitize=address -g -Wc++-compat -fopenmp -march=native -pthread -DHAVE_KALLOC
CPPFLAGS=--std=c++11 -Wall -O2 -fsanitize=address -g -fopenmp -march=native -pthread -DHAVE_KALLOC
else
CFLAGS=-Wall -O3 -Wc++-compat -DHAVE_KALLOC
CFLAGS=-Wall -O3 -Wc++-compat -fopenmp -march=native -pthread -DHAVE_KALLOC
CPPFLAGS=-std=c++11 -Wall -O3 -fopenmp -march=native -pthread -DHAVE_KALLOC
endif

#pass PROFILE=1 to make to enable profiling
ifdef PROFILE
CFLAGS+=-g -fno-omit-frame-pointer -DPROFILERH=1
CPPFLAGS+=-g -fno-omit-frame-pointer -DPROFILERH=1
CFLAGS+=-g -fno-omit-frame-pointer -march=native -DPROFILERH=1
CPPFLAGS+=-g -fno-omit-frame-pointer -march=native -DPROFILERH=1
endif

OBJS= kthread.o kalloc.o bseq.o roptions.o sequence_until.o rutils.o rsig.o revent.o rsketch.o rindex.o lchain.o rseed.o rmap.o hit.o main.o
Expand Down
20 changes: 10 additions & 10 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,11 +211,11 @@ int main(int argc, char *argv[])
else if (c == 319) opt.bw_long = atoi(o.arg);// --bw-long
else if (c == 320) opt.max_num_chunk = atoi(o.arg);// --max-chunks
else if (c == 321) opt.min_mapq = atoi(o.arg);// --min-mapq
else if (c == 322) opt.min_bestmapq = atoi(o.arg);// --min-bestmapq
else if (c == 323) opt.min_bestmapq_ratio = atof(o.arg);// --min-bestmapq-ratio
else if (c == 324) opt.min_bestchain_ratio = atof(o.arg);// --min-bestchain-ratio
else if (c == 325) opt.min_meanmapq_ratio = atof(o.arg);// --min-meanmapq-ratio
else if (c == 326) opt.min_meanchain_ratio = atof(o.arg);// --min-meanchain-ratio
// else if (c == 322) opt.min_bestmapq = atoi(o.arg);// --min-bestmapq
// else if (c == 323) opt.min_bestmapq_ratio = atof(o.arg);// --min-bestmapq-ratio
// else if (c == 324) opt.min_bestchain_ratio = atof(o.arg);// --min-bestchain-ratio
// else if (c == 325) opt.min_meanmapq_ratio = atof(o.arg);// --min-meanmapq-ratio
// else if (c == 326) opt.min_meanchain_ratio = atof(o.arg);// --min-meanchain-ratio
else if (c == 327) {opt.bp_per_sec = atoi(o.arg); opt.sample_per_base = (float)opt.sample_rate / opt.bp_per_sec;}// --bp-per-sec
else if (c == 328) {opt.sample_rate = atoi(o.arg); opt.sample_per_base = (float)opt.sample_rate / opt.bp_per_sec;}// --sample-rate
else if (c == 329) opt.chunk_size = atoi(o.arg);// --chunk-size
Expand Down Expand Up @@ -286,11 +286,11 @@ int main(int argc, char *argv[])
fprintf(fp_help, "\n Mapping Decisions (Mapping and sequencing is stopped after taking any of these decisions):\n");
fprintf(fp_help, " --max-chunks INT stop mapping (read not mapped) after sequencing INT number of chunks [%u]\n", opt.max_num_chunk);
fprintf(fp_help, " --min-mapq INT map the read if there is only one chain and its MAPQ > INT [%d]\n", opt.min_mapq);
fprintf(fp_help, " --min-bestmapq INT map the read if the best chain MAPQ > INT and the second-best chain MAPQ == 0 [%d]\n", opt.min_bestmapq);
fprintf(fp_help, " --min-bestmapq-ratio FLOAT map the read if the MAPQ ratio between the best and the second-best chain is larger than or equal to FLOAT [%g]\n", opt.min_bestmapq_ratio);
fprintf(fp_help, " --min-bestchain-ratio FLOAT map the read if the chain score ratio between the best and the second-best chain is larger than or equal to FLOAT [%g]\n", opt.min_bestchain_ratio);
fprintf(fp_help, " --min-meanmapq-ratio FLOAT map the read if the MAPQ ratio between the best and all chains is larger than or equal to FLOAT [%g]\n", opt.min_meanmapq_ratio);
fprintf(fp_help, " --min-meanchain-ratio FLOAT map the read if the chain score ratio between the best and all chains is larger than or equal to FLOAT [%g]\n", opt.min_meanchain_ratio);
// fprintf(fp_help, " --min-bestmapq INT map the read if the best chain MAPQ > INT and the second-best chain MAPQ == 0 [%d]\n", opt.min_bestmapq);
// fprintf(fp_help, " --min-bestmapq-ratio FLOAT map the read if the MAPQ ratio between the best and the second-best chain is larger than or equal to FLOAT [%g]\n", opt.min_bestmapq_ratio);
// fprintf(fp_help, " --min-bestchain-ratio FLOAT map the read if the chain score ratio between the best and the second-best chain is larger than or equal to FLOAT [%g]\n", opt.min_bestchain_ratio);
// fprintf(fp_help, " --min-meanmapq-ratio FLOAT map the read if the MAPQ ratio between the best and all chains is larger than or equal to FLOAT [%g]\n", opt.min_meanmapq_ratio);
// fprintf(fp_help, " --min-meanchain-ratio FLOAT map the read if the chain score ratio between the best and all chains is larger than or equal to FLOAT [%g]\n", opt.min_meanchain_ratio);

fprintf(fp_help, "\n Nanopore Parameters:\n");
fprintf(fp_help, " --bp-per-sec INT DNA molecules transiting through the pore (bp per second) [%u]\n", opt.bp_per_sec);
Expand Down
32 changes: 16 additions & 16 deletions src/rmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -261,39 +261,39 @@ static void map_worker_for(void *_data,
float bestQ = reg0->creg[0].mapq, best2Q = reg0->creg[1].mapq;
float bestC = reg0->creg[0].score, best2C = reg0->creg[1].score;

if (bestQ > opt->min_bestmapq && best2Q == 0) {is_mapped = 1; break;}
// if (bestQ > opt->min_bestmapq && best2Q == 0) {is_mapped = 1; break;}

float meanC = 0;
float meanQ = 0;
int non_zero_mapq = 0;
for (uint32_t c_ind = 0; c_ind < reg0->n_cregs; ++c_ind){
meanC += reg0->creg[c_ind].score;
meanQ += reg0->creg[c_ind].mapq;
if(reg0->creg[c_ind].mapq > 0) ++non_zero_mapq;
// if(reg0->creg[c_ind].mapq > 0) ++non_zero_mapq;
}

meanC /= reg0->n_cregs;
meanQ /= reg0->n_cregs;

//TODO: Experimental weighted sum technique for the next commits
// float r_bestq = bestQ/(1+bestQ);
// float r_best2q = (best2Q > 0)?(bestQ/best2Q)/(1+(bestQ/best2Q)):1.0;
// float r_best2c = (best2C > 0)?(bestC/best2C)/(1+(bestC/best2C)):1.0;
// float r_bestmq = (meanQ > 0)?(bestQ/meanQ)/(1+(bestQ/meanQ)):0.0;
// float r_bestmc = (meanC > 0)?(bestC/meanC)/(1+(bestC/meanC)):0.0;
float r_bestq = bestQ/(1+bestQ);
float r_best2q = (best2Q > 0)?(bestQ/best2Q)/(1+(bestQ/best2Q)):1.0;
float r_best2c = (best2C > 0)?(bestC/best2C)/(1+(bestC/best2C)):1.0;
float r_bestmq = (meanQ > 0)?(bestQ/meanQ)/(1+(bestQ/meanQ)):0.0;
float r_bestmc = (meanC > 0)?(bestC/meanC)/(1+(bestC/meanC)):0.0;

// // Compute the weighted sum
// float weighted_sum = opt->w_bestq*r_bestq + opt->w_best2q*r_best2q +
// opt->w_best2c*r_best2c + opt->w_bestmq*r_bestmq + opt->w_bestmc*r_bestmc;
// Compute the weighted sum
float weighted_sum = opt->w_bestq*r_bestq + opt->w_best2q*r_best2q + opt->w_bestmq*r_bestmq +
opt->w_best2c*r_best2c + opt->w_bestmc*r_bestmc;

// // Compare the weighted sum against a threshold to make the decision
// if (weighted_sum >= opt->w_threshold) {is_mapped = 1; break;}
// Compare the weighted sum against a threshold to make the decision
if (weighted_sum >= opt->w_threshold) {is_mapped = 1; break;}

if(non_zero_mapq < 3 && best2Q > 0 && bestQ/best2Q >= opt->min_bestmapq_ratio) {is_mapped = 1; break;}
if(non_zero_mapq < 3 && best2C > 0 && bestC/best2C >= opt->min_bestchain_ratio) {is_mapped = 1; break;}
// if(non_zero_mapq < 3 && best2Q > 0 && bestQ/best2Q >= opt->min_bestmapq_ratio) {is_mapped = 1; break;}
// if(non_zero_mapq < 3 && best2C > 0 && bestC/best2C >= opt->min_bestchain_ratio) {is_mapped = 1; break;}

if (non_zero_mapq > 2 && meanQ > 0 && bestQ >= opt->min_meanmapq_ratio * meanQ) {is_mapped = 1; break;}
if (bestQ > 0 && meanC > 0 && bestC >= opt->min_meanchain_ratio * meanC) {is_mapped = 1; break;}
// if (non_zero_mapq > 2 && meanQ > 0 && bestQ >= opt->min_meanmapq_ratio * meanQ) {is_mapped = 1; break;}
// if (bestQ > 0 && meanC > 0 && bestC >= opt->min_meanchain_ratio * meanC) {is_mapped = 1; break;}

//TODO: increase score for those with query ending towards the end (i.e., meaning they benefited from more recent sequencing)
} else if (reg0->n_cregs == 1 && reg0->creg[0].mapq >= opt->min_mapq) {is_mapped = 1; break;}
Expand Down
8 changes: 4 additions & 4 deletions src/roptions.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,12 @@ void ri_mapopt_init(ri_mapopt_t *opt)
opt->max_num_chunk = 15;//--max-chunks
// opt->min_chain_anchor = 10;

opt->min_bestmapq = 5; //--min-bestmapq
opt->min_mapq = 2; //--min-mapq
opt->min_bestmapq_ratio = 2.0f, opt->min_meanmapq_ratio = 5.0f; //--min-bestmapq-ratio, --min-meanmapq-ratio
// opt->min_bestmapq = 5; //--min-bestmapq
// opt->min_bestmapq_ratio = 2.0f, opt->min_meanmapq_ratio = 5.0f; //--min-bestmapq-ratio, --min-meanmapq-ratio

opt->min_bestchain_ratio = 2.5f; //--min-bestchain-ratio
opt->min_meanchain_ratio = 2.5f; //--min-meanchain-ratio
// opt->min_bestchain_ratio = 2.5f; //--min-bestchain-ratio
// opt->min_meanchain_ratio = 2.5f; //--min-meanchain-ratio

//Default options for event detection.
opt->window_length1 = 3; //--seg-window-length1
Expand Down
8 changes: 4 additions & 4 deletions src/roptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,10 @@ typedef struct ri_mapopt_s{
uint32_t max_num_chunk;
// uint32_t min_chain_anchor;

int min_mapq, min_bestmapq;
float min_bestmapq_ratio, min_meanmapq_ratio;

float min_bestchain_ratio, min_meanchain_ratio;
int min_mapq;
// int min_bestmapq;
// float min_bestmapq_ratio, min_meanmapq_ratio;
// float min_bestchain_ratio, min_meanchain_ratio;

float t_threshold;
uint32_t tn_samples;
Expand Down
8 changes: 4 additions & 4 deletions test/data/download_d3_yeast.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ wget -qO- https://sra-pub-src-1.s3.amazonaws.com/SRR8648503/GLU1II_basecalled_fa
find ./GLU1II_basecalled_fast5_1 -type f -name '*.fast5' | head -50000 | xargs -i{} mv {} ./fast5_files/; rm -rf GLU1II_basecalled_fast5_1;

#To extract the reads from FAST5 from this dataset, you will need to clone the following repository and make sure you have h5py <= 2.9 (if you have conda you can do the following):
# conda create -n oldh5 h5py=2.9.0; conda activate oldh5;
# git clone https://github.com/rrwick/Fast5-to-Fastq
# Fast5-to-Fastq/fast5_to_fastq.py fast5_files/ | awk 'BEGIN{line = 0}{line++; if(line %4 == 1){print ">"substr($1,2,36)}else if(line % 4 == 2){print $0}}' > reads.fasta
conda create -n oldh5 h5py=2.9.0; conda activate oldh5;
git clone https://github.com/rrwick/Fast5-to-Fastq
Fast5-to-Fastq/fast5_to_fastq.py fast5_files/ | awk 'BEGIN{line = 0}{line++; if(line %4 == 1){print ">"substr($1,2,36)}else if(line % 4 == 2){print $0}}' > reads.fasta

# Optional: Single to multi conversion. Requires the following tool: https://github.com/nanoporetech/ont_fast5_api
# single_to_multi_fast5 --input_path ./fast5_files/ --save_path ./multi_fast5_files/ -t 32
Expand All @@ -22,7 +22,7 @@ find ./GLU1II_basecalled_fast5_1 -type f -name '*.fast5' | head -50000 | xargs -
# pod5 convert fast5 -r -t 32 --output-one-to-one ./fast5_files/ ./fast5_files/ ./pod5_files/

#We have provided the Zenodo link to this reads.fasta file to avoid the hassle above
wget https://zenodo.org/record/7582018/files/reads.fasta
# wget https://zenodo.org/record/7582018/files/reads.fasta

#Downloading S.cerevisiae S288c (Yeast) reference genome from UCSC
wget https://hgdownload.soe.ucsc.edu/goldenPath/sacCer3/bigZips/sacCer3.fa.gz; gunzip sacCer3.fa.gz; mv sacCer3.fa ref.fa;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
uncalled pafstats -r ../true_mappings.paf --annotate ../uncalled/relative_abundance_uncalled.paf > relative_abundance_uncalled_ann.paf 2> relative_abundance_uncalled.throughput
uncalled pafstats -r ../true_mappings.paf --annotate ../sigmap/relative_abundance_sigmap.paf > relative_abundance_sigmap_ann.paf 2> relative_abundance_sigmap.throughput
uncalled pafstats -r ../true_mappings.paf --annotate ../rawhash/relative_abundance_rawhash_fast.paf > relative_abundance_rawhash_fast_ann.paf 2> relative_abundance_rawhash_fast.throughput
uncalled pafstats -r ../true_mappings.paf --annotate ../rawhash/relative_abundance_w3_rawhash_fast.paf > relative_abundance_w3_rawhash_fast_ann.paf 2> relative_abundance_w3_rawhash_fast.throughput

python ../../../scripts/compare_pafs.py relative_abundance_uncalled_ann.paf relative_abundance_sigmap_ann.paf relative_abundance_rawhash_fast_ann.paf > relative_abundance_rawhash_fast.comparison
python ../../../scripts/compare_pafs.py relative_abundance_uncalled_ann.paf relative_abundance_sigmap_ann.paf relative_abundance_w3_rawhash_fast_ann.paf > relative_abundance_w3_rawhash_fast.comparison

for i in '../true_mappings.paf' '../uncalled/relative_abundance_uncalled.paf' '../sigmap/relative_abundance_sigmap.paf' '../rawhash/relative_abundance_rawhash_fast.paf' ; do
for i in '../true_mappings.paf' '../uncalled/relative_abundance_uncalled.paf' '../sigmap/relative_abundance_sigmap.paf' '../rawhash/relative_abundance_rawhash_fast.paf' '../rawhash/relative_abundance_w3_rawhash_fast.paf' ; do
if test -f $i; then
fname=`basename $i | sed 's/.paf/.abundance/g'`;
awk 'BEGIN{tot=0; covid=0; human=0; ecoli=0; yeast=0; algae=0; totsum=0; covidsum=0; humansum=0; ecolisum=0; yeastsum=0; algaesum=0;}{
Expand Down
80 changes: 72 additions & 8 deletions test/evaluation/relative_abundance/comparison/2_output_results.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,71 @@ echo "Sigmap throughput (mean/median)"
grep "BP per sec:" relative_abundance_sigmap.throughput
echo "RawHash throughput (mean/median)"
grep "BP per sec:" relative_abundance_rawhash_fast.throughput
echo "RawHash (Minimizer w = 3) throughput (mean/median)"
grep "BP per sec:" relative_abundance_w3_rawhash_fast.throughput
# echo "RawHash (Minimizer w = 5) throughput (mean/median)"
# grep "BP per sec:" relative_abundance_w5_rawhash_fast.throughput

echo;
grep "Uncalled Mean time per read" relative_abundance_rawhash_fast.comparison
grep "Sigmap Mean time per read" relative_abundance_rawhash_fast.comparison
grep "RawHash Mean time per read" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash Mean time per read" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash Mean time per read" relative_abundance_w5_rawhash_fast.comparison

# echo;
# grep "RawHash Mean gap between read anchors in the best chain" relative_abundance_rawhash_fast.comparison
# grep "RawHash Mean gap between reference anchors in the best chain" relative_abundance_rawhash_fast.comparison

echo;
grep "Uncalled Mean # of sequenced bases per read" relative_abundance_rawhash_fast.comparison
grep "RawHash Mean # of sequenced bases per read" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash Mean # of sequenced bases per read" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash Mean # of sequenced bases per read" relative_abundance_w5_rawhash_fast.comparison

echo;
grep "Sigmap Mean # of sequenced chunks per read" relative_abundance_rawhash_fast.comparison
grep "RawHash Mean # of sequenced chunks per read" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash Mean # of sequenced chunks per read" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash Mean # of sequenced chunks per read" relative_abundance_w5_rawhash_fast.comparison

echo;
grep "Uncalled Mean (only mapped) # of sequenced bases per read" relative_abundance_rawhash_fast.comparison
grep "RawHash Mean (only mapped) # of sequenced bases per read" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash Mean (only mapped) # of sequenced bases per read" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash Mean (only mapped) # of sequenced bases per read" relative_abundance_w5_rawhash_fast.comparison

echo;
grep "Sigmap Mean (only mapped) # of sequenced chunks per read" relative_abundance_rawhash_fast.comparison
grep "RawHash Mean (only mapped) # of sequenced chunks per read" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash Mean (only mapped) # of sequenced chunks per read" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash Mean (only mapped) # of sequenced chunks per read" relative_abundance_w5_rawhash_fast.comparison

echo;
grep "Uncalled Mean (only unmapped) # of sequenced bases per read" relative_abundance_rawhash_fast.comparison
grep "RawHash Mean (only unmapped) # of sequenced bases per read" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash Mean (only unmapped) # of sequenced bases per read" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash Mean (only unmapped) # of sequenced bases per read" relative_abundance_w5_rawhash_fast.comparison

echo;
grep "Sigmap Mean (only unmapped) # of sequenced chunks per read" relative_abundance_rawhash_fast.comparison
grep "RawHash Mean (only unmapped) # of sequenced chunks per read" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash Mean (only unmapped) # of sequenced chunks per read" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash Mean (only unmapped) # of sequenced chunks per read" relative_abundance_w5_rawhash_fast.comparison

echo;
echo '(Indexing) Timing and memory usage results:'
Expand Down Expand Up @@ -53,25 +113,29 @@ for i in `echo ../*/*_map_*.time`; do echo $i;
}
}' $i; done

echo;
echo "Uncalled relative abundance: "
cat relative_abundance_uncalled.abundance
echo "Sigmap relative abundance: "
cat relative_abundance_sigmap.abundance
echo "RawHash relative abundance: "
cat relative_abundance_rawhash_fast.abundance

echo;
grep "Uncalled precision:" relative_abundance_rawhash_fast.comparison
grep "Sigmap precision:" relative_abundance_rawhash_fast.comparison
grep "RawHash precision:" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash precision:" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash precision:" relative_abundance_w5_rawhash_fast.comparison

echo;
grep "Uncalled recall:" relative_abundance_rawhash_fast.comparison
grep "Sigmap recall:" relative_abundance_rawhash_fast.comparison
grep "RawHash recall:" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash recall:" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash recall:" relative_abundance_w5_rawhash_fast.comparison

echo;
grep "Uncalled F-1 score:" relative_abundance_rawhash_fast.comparison
grep "Sigmap F-1 score:" relative_abundance_rawhash_fast.comparison
grep "RawHash F-1 score:" relative_abundance_rawhash_fast.comparison
echo "(Minimizer w = 3)"
grep "RawHash F-1 score:" relative_abundance_w3_rawhash_fast.comparison
# echo "(Minimizer w = 5)"
# grep "RawHash F-1 score:" relative_abundance_w5_rawhash_fast.comparison

0 comments on commit a09971c

Please sign in to comment.