Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
ban-m committed Feb 1, 2023
1 parent f30de7d commit 91ba125
Show file tree
Hide file tree
Showing 16 changed files with 106 additions and 102 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion haplotyper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ histgram_viz = {git = "https://github.com/ban-m/histgram_viz", branch = "master"
gfa = {git = "https://github.com/ban-m/gfa_rust.git", branch = "master"}
edlib_sys = {git = "https://github.com/ban-m/edlib-sys.git", branch = "master"}
bytecount = "0.6.2"
kiley = {git = "https://github.com/ban-m/kiley.git", branch = "master"}
kiley = {git = "https://github.com/ban-m/kiley.git", branch = "master", version = "0.2.0"}
# kiley = {path = "../../kiley"}
9 changes: 5 additions & 4 deletions haplotyper/src/consensus/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use crate::assemble::ditch_graph::UnitAlignmentInfo;
use crate::model_tune::ModelFit;
use definitions::*;
use gfa::Segment;
use kiley::hmm::guided::PairHiddenMarkovModelOnStrands;
use kiley::hmm::PairHiddenMarkovModelOnStrands;
use kiley::Op;
use rand::prelude::SliceRandom;
use rand::Rng;
Expand Down Expand Up @@ -535,7 +535,8 @@ fn global_align(query: &[u8], target: &[u8]) -> Vec<Op> {
}

fn bootstrap_consensus(seqs: &[&[u8]], ops: &mut [Vec<Op>], radius: usize) -> Vec<u8> {
let draft = kiley::ternary_consensus_by_chunk(seqs, radius);
// let draft = kiley::ternary_consensus_by_chunk(seqs, radius);
let draft = seqs[0].to_vec();
for (seq, ops) in std::iter::zip(seqs, ops.iter_mut()) {
*ops = global_align(seq, &draft);
}
Expand Down Expand Up @@ -1239,7 +1240,7 @@ fn check(query: &[u8], seq: &[u8], is_forward: bool) {
let (start, end) = aln.location().unwrap();
let ops: Vec<_> = crate::misc::edlib_to_kiley(aln.operations().unwrap());
let refr = &seq[start..end + 1];
let (r, a, q) = kiley::recover(refr, query, &ops);
let (r, a, q) = kiley::op::recover(refr, query, &ops);
for ((r, a), q) in r.chunks(200).zip(a.chunks(200)).zip(q.chunks(200)) {
eprintln!("{}", std::str::from_utf8(r).unwrap());
eprintln!("{}", std::str::from_utf8(a).unwrap());
Expand All @@ -1255,7 +1256,7 @@ fn check(query: &[u8], seq: &[u8], is_forward: bool) {
let (start, end) = aln.location().unwrap();
let ops = crate::misc::edlib_to_kiley(aln.operations().unwrap());
let refr = &rev[start..end + 1];
let (r, a, q) = kiley::recover(refr, query, &ops);
let (r, a, q) = kiley::op::recover(refr, query, &ops);
for ((r, a), q) in r.chunks(200).zip(a.chunks(200)).zip(q.chunks(200)) {
eprintln!("{}", std::str::from_utf8(r).unwrap());
eprintln!("{}", std::str::from_utf8(a).unwrap());
Expand Down
3 changes: 2 additions & 1 deletion haplotyper/src/dense_encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,8 @@ fn consensus(mut seqs: Vec<Vec<u8>>, cov_thr: usize) -> Option<Vec<u8>> {
if seqs.len() <= cov_thr {
return None;
}
let draft = kiley::polish_by_pileup(&seqs[0], &seqs[1..]);
let draft = seqs[0].to_vec();
// let draft = kiley::polish_by_pileup(&seqs[0], &seqs[1..]);
let task = edlib_sys::AlignTask::Alignment;
let mode = edlib_sys::AlignMode::Global;
let mut ops: Vec<_> = seqs
Expand Down
10 changes: 5 additions & 5 deletions haplotyper/src/determine_units.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ fn show_minimap2_version() {
let version = std::process::Command::new("minimap2")
.args(["--version"])
.output()
.unwrap();
.expect("Minimap2 is unavailable.");
if !version.status.success() {
error!("Minimap2 is not available. Please install minimap2(https://github.com/lh3/minimap2) first.");
panic!("Minimap2,{:?}", String::from_utf8_lossy(&version.stderr));
Expand Down Expand Up @@ -429,10 +429,10 @@ fn take_consensus<K: Hash + Clone + Eq + Sync + Send>(
.par_iter()
.map(|(key, seqs)| {
let radius = read_type.band_width(config.chunk_len);
let mut draft = pick_median_length(seqs.as_slice());
for _ in 0..2 {
draft = kiley::polish_by_pileup(&draft, seqs);
}
let draft = pick_median_length(seqs.as_slice());
// for _ in 0..2 {
// draft = kiley::polish_by_pileup(&draft, seqs);
// }
let consensus = kiley::bialignment::guided::polish_until_converge(&draft, seqs, radius);
(key.clone(), consensus)
})
Expand Down
3 changes: 1 addition & 2 deletions haplotyper/src/encode/deletion_fill.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,6 @@ fn correct_deletion_error(
new_inserts
}


const THR: f64 = 10f64;
fn try_encoding_head(
nodes: &[Node],
Expand Down Expand Up @@ -496,7 +495,7 @@ fn fine_mapping<'a>(
// let mut ops = vec![kiley::Op::Del; start];
// ops.extend(edlib_op_to_kiley_op(aln.operations().unwrap()));
// ops.extend(vec![kiley::Op::Del; orig_query.len() - end - 1]);
let (xr, ar, yr) = kiley::recover(unitseq, query, &ops);
let (xr, ar, yr) = kiley::op::recover(unitseq, query, &ops);
for ((xr, ar), yr) in xr.chunks(200).zip(ar.chunks(200)).zip(yr.chunks(200)) {
eprintln!("ALN\t{}", String::from_utf8_lossy(xr));
eprintln!("ALN\t{}", String::from_utf8_lossy(ar));
Expand Down
16 changes: 8 additions & 8 deletions haplotyper/src/likelihood_gains.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use rand::{prelude::SliceRandom, SeedableRng};
use rand_xoshiro::Xoshiro256StarStar;
use rayon::prelude::*;
pub fn estimate_minimum_gain(hmm: &kiley::hmm::guided::PairHiddenMarkovModel) -> f64 {
pub fn estimate_minimum_gain(hmm: &kiley::hmm::PairHiddenMarkovModel) -> f64 {
const SEED: u64 = 23908;
const SAMPLE_NUM: usize = 1000;
const SEQ_NUM: usize = 500;
Expand All @@ -18,8 +18,8 @@ pub fn estimate_minimum_gain(hmm: &kiley::hmm::guided::PairHiddenMarkovModel) ->
let mut lks: Vec<_> = (0..SEQ_NUM)
.map(|_| {
let read = hmm.gen(&hap1, &mut rng);
let lk_base = hmm.likelihood(&hap1, &read, BAND);
let lk_diff = hmm.likelihood(&hap2, &read, BAND);
let lk_base = hmm.likelihood_bootstrap(&hap1, &read, BAND);
let lk_diff = hmm.likelihood_bootstrap(&hap2, &read, BAND);
lk_base - lk_diff
})
.collect();
Expand Down Expand Up @@ -152,7 +152,7 @@ impl Pvalues {
}
}

use kiley::hmm::guided::PairHiddenMarkovModel;
use kiley::hmm::PairHiddenMarkovModel;
pub fn estimate_gain(
hmm: &PairHiddenMarkovModel,
seed: u64,
Expand Down Expand Up @@ -261,8 +261,8 @@ fn gain_of(
let mut lk_diff: Vec<_> = (0..SEQ_NUM)
.map(|_| {
let read = hmm.gen(&diff, &mut rng);
let lk_base = hmm.likelihood(&template, &read, band);
let lk_diff = hmm.likelihood(&diff, &read, band);
let lk_base = hmm.likelihood_bootstrap(&template, &read, band);
let lk_diff = hmm.likelihood_bootstrap(&diff, &read, band);
lk_diff - lk_base
})
.collect();
Expand All @@ -276,8 +276,8 @@ fn gain_of(
let null_prob = (0..SEQ_NUM)
.filter(|_| {
let read = hmm.gen(&template, &mut rng);
let lk_base = hmm.likelihood(&template, &read, band);
let lk_diff = hmm.likelihood(&diff, &read, band);
let lk_base = hmm.likelihood_bootstrap(&template, &read, band);
let lk_diff = hmm.likelihood_bootstrap(&diff, &read, band);
lk_base + min_gain < lk_diff
})
.count();
Expand Down
13 changes: 7 additions & 6 deletions haplotyper/src/local_clustering/kmeans.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,16 @@ pub fn clustering<R: Rng, T: std::borrow::Borrow<[u8]>>(
cluster_num: usize,
band: usize,
) -> ClusteringResult {
let mut template = kiley::ternary_consensus_by_chunk(reads, band);
let mut hmm = kiley::hmm::guided::PairHiddenMarkovModel::default();
let mut template =
kiley::bialignment::guided::polish_until_converge(reads[0].borrow(), reads, band);
let mut hmm = kiley::hmm::PairHiddenMarkovModel::default();
hmm.fit_naive(&template, reads, band);
let gains = crate::likelihood_gains::estimate_gain(&hmm, 4283094, 100, 20, 5);
let cov = reads.len() as f64 / cluster_num as f64;
let config = ClusteringConfig::new(band, cluster_num, cov, cov, &gains);
let mut ops: Vec<_> = reads
.iter()
.map(|x| hmm.align(&template, x.borrow(), band).1)
.map(|x| hmm.align_guided(&template, x.borrow(), band).1)
.collect();
for t in 1..3 {
hmm.fit_naive_with(&template, reads, &ops, band / t);
Expand All @@ -69,7 +70,7 @@ fn modification_table<T: std::borrow::Borrow<[u8]>>(
reads: &[T],
ops: &[Vec<kiley::Op>],
band: usize,
hmm: &kiley::hmm::guided::PairHiddenMarkovModel,
hmm: &kiley::hmm::PairHiddenMarkovModel,
) -> Vec<Vec<f64>> {
reads
.iter()
Expand Down Expand Up @@ -100,7 +101,7 @@ pub fn clustering_dev<R: Rng, T: std::borrow::Borrow<[u8]>>(
ops: &[Vec<kiley::Op>],
strands: &[bool],
rng: &mut R,
hmm: &kiley::hmm::guided::PairHiddenMarkovModel,
hmm: &kiley::hmm::PairHiddenMarkovModel,
config: &ClusteringConfig,
) -> ClusteringDevResult {
if config.copy_num < 2 {
Expand Down Expand Up @@ -136,7 +137,7 @@ pub fn search_variants<R: Rng, T: std::borrow::Borrow<[u8]>>(
ops: &[Vec<kiley::Op>],
strands: &[bool],
rng: &mut R,
hmm: &kiley::hmm::guided::PairHiddenMarkovModel,
hmm: &kiley::hmm::PairHiddenMarkovModel,
config: &ClusteringConfig,
) -> FeatureVector {
let profiles = modification_table(template, reads, ops, config.band_width, hmm);
Expand Down
2 changes: 1 addition & 1 deletion haplotyper/src/local_clustering/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ fn update_by_clusterings(

// TODO: this function is very, very slow. Please speed it up.
// Or, maybe we do not need to tune a pHMM on each chunk. We can just use one pHMM across all the chunks.
type Phmm = kiley::hmm::guided::PairHiddenMarkovModel;
type Phmm = kiley::hmm::PairHiddenMarkovModel;
fn prep_consensus(
hmm: &Phmm,
draft: &[u8],
Expand Down
89 changes: 45 additions & 44 deletions haplotyper/src/model_tune.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use definitions::*;
use kiley::hmm::guided::HMMConfig;
use std::collections::HashMap;

use kiley::hmm::guided::PairHiddenMarkovModel;
use kiley::hmm::guided::PairHiddenMarkovModelOnStrands;
use kiley::hmm::PairHiddenMarkovModel;
use kiley::hmm::PairHiddenMarkovModelOnStrands;

pub trait ModelFit {
fn fit_models_on_both_strands(&self) -> Option<PairHiddenMarkovModelOnStrands>;
Expand All @@ -18,7 +19,7 @@ impl ModelFit for DataSet {
}
}

pub fn get_model(ds: &DataSet) -> Option<kiley::hmm::guided::PairHiddenMarkovModel> {
pub fn get_model(ds: &DataSet) -> Option<kiley::hmm::PairHiddenMarkovModel> {
ds.model_param.as_ref().map(|param| {
let &HMMParam {
mat_mat,
Expand Down Expand Up @@ -87,27 +88,11 @@ pub fn update_model(ds: &mut DataSet) {
});
}

const TRAIN_UNIT_SIZE: usize = 2;
const TRAIN_UNIT_SIZE: usize = 3;
const TRAIN_ROUND: usize = 3;
fn estimate_model_parameters_on_both_strands(
ds: &DataSet,
) -> Option<PairHiddenMarkovModelOnStrands> {
let model = ds.get_model()?;
let chunks: HashMap<_, _> = ds.selected_chunks.iter().map(|c| (c.id, c)).collect();
let (fpileups, rpileups) = {
let mut fpileups: HashMap<_, _> = chunks.keys().map(|&k| (k, vec![])).collect();
let mut rpileups: HashMap<_, _> = fpileups.clone();
for node in ds.encoded_reads.iter().flat_map(|r| r.nodes.iter()) {
let bucket = match node.is_forward {
true => fpileups.get_mut(&node.unit),
false => rpileups.get_mut(&node.unit),
};
if let Some(bucket) = bucket {
bucket.push(node);
}
}
(truncate_bucket(fpileups), truncate_bucket(rpileups))
};
fn truncate_bucket<T>(pileups: HashMap<u64, Vec<T>>) -> Vec<(u64, Vec<T>)> {
let mut covs: Vec<_> = pileups.values().map(|x| x.len()).collect();
let (_, &mut cov, _) = covs.select_nth_unstable(pileups.len() / 2);
Expand All @@ -117,19 +102,24 @@ fn estimate_model_parameters_on_both_strands(
.take(TRAIN_UNIT_SIZE)
.collect()
}
for (uid, seqs) in fpileups.iter() {
debug!("LOCAL\tSAMPLE\t{uid}\t{}\tFORWARD", seqs.len());
}
for (uid, seqs) in rpileups.iter() {
debug!("LOCAL\tSAMPLE\t{uid}\t{}\tREVERSE", seqs.len());
}
fn train_inner(
let model = ds.get_model()?;
let chunks: HashMap<_, _> = ds.selected_chunks.iter().map(|c| (c.id, c)).collect();
let models = PairHiddenMarkovModelOnStrands::new(model.clone(), model);
let pileups = {
let mut pileups: HashMap<_, _> = chunks.keys().map(|&k| (k, vec![])).collect();
for node in ds.encoded_reads.iter().flat_map(|r| r.nodes.iter()) {
pileups.entry(node.unit).or_default().push(node);
}
truncate_bucket(pileups)
};
let models = train(ds.read_type, &chunks, &pileups, &models);
fn train(
read_type: ReadType,
chunks: &HashMap<u64, &Unit>,
pileups: &[(u64, Vec<&Node>)],
model: &PairHiddenMarkovModel,
) -> PairHiddenMarkovModel {
let mut model = model.clone();
models: &PairHiddenMarkovModelOnStrands,
) -> PairHiddenMarkovModelOnStrands {
let mut models = models.clone();
let mut polishing_pairs: Vec<_> = pileups
.iter()
.filter_map(|(uid, nodes)| {
Expand All @@ -140,31 +130,42 @@ fn estimate_model_parameters_on_both_strands(
.map(|n| crate::misc::ops_to_kiley(&n.cigar))
.collect();
let seqs: Vec<_> = nodes.iter().map(|n| n.seq()).collect();
let strands: Vec<_> = nodes.iter().map(|n| n.is_forward).collect();
let chunk_seq = ref_unit.seq().to_vec();
Some((chunk_seq, seqs, ops, band_width))
Some((chunk_seq, seqs, ops, strands, band_width))
})
.collect();
assert!(!polishing_pairs.is_empty());
use rayon::prelude::*;
let bw = polishing_pairs.iter().map(|x| x.4).max().unwrap();
for _ in 0..TRAIN_ROUND {
for (consensus, seqs, ops, bw) in polishing_pairs.iter_mut() {
use kiley::bialignment::guided;
*consensus = guided::polish_until_converge_with(consensus, seqs, ops, *bw);
*consensus = model.polish_until_converge_with(consensus, seqs, ops, *bw);
model.fit_naive_with_par(consensus, seqs, ops, *bw);
}
polishing_pairs
.par_iter_mut()
.for_each(|(cons, seqs, ops, strands, bw)| {
let config = HMMConfig::new(*bw, seqs.len(), 0);
*cons =
models.polish_until_converge_with_conf(cons, seqs, ops, strands, &config);
});
let training_datapack: Vec<_> = polishing_pairs
.iter()
.map(|(cons, seqs, ops, strands, _)| {
kiley::hmm::TrainingDataPack::new(cons, strands, seqs, ops)
})
.collect();
models.fit_multiple_with_par(&training_datapack, bw);
}
debug!("HMM\n{}", model);
model
debug!("FORWARD\tHMM\n{}", models.forward());
debug!("REVERSE\tHMM\n{}", models.reverse());
models.clone()
}
let forward = train_inner(ds.read_type, &chunks, &fpileups, &model);
let reverse = train_inner(ds.read_type, &chunks, &rpileups, &model);
Some(PairHiddenMarkovModelOnStrands::new(forward, reverse))
Some(models)
}

fn estimate_model_parameters<N: std::borrow::Borrow<Node>>(
read_type: ReadType,
pileups: &HashMap<(u64, u64), Vec<N>>,
chunks: &HashMap<u64, &Unit>,
) -> kiley::hmm::guided::PairHiddenMarkovModel {
) -> kiley::hmm::PairHiddenMarkovModel {
let mut covs: Vec<_> = pileups.iter().map(|x| x.1.len()).collect();
let (_, &mut cov, _) = covs.select_nth_unstable(pileups.len() / 2);
let mut seqs_and_ref_units: Vec<_> = pileups
Expand All @@ -177,7 +178,7 @@ fn estimate_model_parameters<N: std::borrow::Borrow<Node>>(
for (chunk, units) in seqs_and_ref_units.iter() {
debug!("LOCAL\tSAMPLE\t{}\t{}", chunk.id, units.len());
}
let mut hmm = kiley::hmm::guided::PairHiddenMarkovModel::default();
let mut hmm = kiley::hmm::PairHiddenMarkovModel::default();
let mut polishing_pairs: Vec<_> = seqs_and_ref_units
.iter()
.map(|(ref_unit, nodes)| {
Expand Down
1 change: 0 additions & 1 deletion haplotyper/src/multiplicity_estimation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ impl MultiplicityEstimation for DataSet {
_ => old,
}
};
// graph.remove_lightweight_edges(thr, false);
debug!("SQUISHED\t{graph}");
use rand::SeedableRng;
use rand_xoshiro::Xoroshiro128PlusPlus;
Expand Down
4 changes: 2 additions & 2 deletions haplotyper/src/polish_segments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ fn polish_segment(
seed: u64,
) -> Vec<u8> {
let config = crate::consensus::PolishConfig::new(seed, 2, 50, window_size, 50, 2);
let hmm = kiley::hmm::guided::PairHiddenMarkovModel::default();
let models = kiley::hmm::guided::PairHiddenMarkovModelOnStrands::new(hmm.clone(), hmm);
let hmm = kiley::hmm::PairHiddenMarkovModel::default();
let models = kiley::hmm::PairHiddenMarkovModelOnStrands::new(hmm.clone(), hmm);
warn!("TODO: Train parameters.");
crate::consensus::polish(sid, draft, alns, &models, &config)
}
Expand Down
Loading

0 comments on commit 91ba125

Please sign in to comment.