Skip to content

Commit

Permalink
Added new granges windows subcommand, fixed bug in from_windows().
Browse files Browse the repository at this point in the history
 - Fixed window behavior to match `bedtools makewindows`.
 - Fixed tests.
 - New test utility functions for generating random BED5 files.
 - New validation against `bedtools makewindows`.
  • Loading branch information
vsbuffalo committed Feb 25, 2024
1 parent 3296995 commit 3e22670
Show file tree
Hide file tree
Showing 6 changed files with 243 additions and 37 deletions.
37 changes: 29 additions & 8 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{
prelude::*,
ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord},
reporting::{CommandOutput, Report},
test_utilities::random_granges,
test_utilities::{random_granges, random_granges_mock_bed5},
traits::TsvSerialize,
Position, PositionOffset,
};
Expand Down Expand Up @@ -469,23 +469,44 @@ pub fn granges_map(
Ok(CommandOutput::new((), report))
}

/// Generate a BED3 file of genomic windows.
pub fn granges_windows(
seqlens: impl Into<PathBuf>,
width: Position,
step: Option<Position>,
chop: bool,
output: Option<impl Into<PathBuf>>,
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
GRangesEmpty::from_windows(&genome, width, step, chop)?.to_tsv(output, &BED_TSV)?;
let report = Report::new();
Ok(CommandOutput::new((), report))
}

/// Generate a random BED-like file with genomic ranges.
pub fn granges_random_bed(
seqlens: impl Into<PathBuf>,
num: usize,
output: Option<impl Into<PathBuf>>,
sort: bool,
bed5: bool,
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

let mut gr = random_granges(&genome, num)?;

if sort {
gr = gr.sort();
}

gr.to_tsv(output, &BED_TSV)?;
if bed5 {
let mut gr = random_granges_mock_bed5(&genome, num)?;
if sort {
gr = gr.sort()
}
gr.to_tsv(output, &BED_TSV)?;
} else {
let mut gr = random_granges(&genome, num)?;
if sort {
gr = gr.sort();
}
gr.to_tsv(output, &BED_TSV)?;
};

let report = Report::new();
Ok(CommandOutput::new((), report))
Expand Down
63 changes: 43 additions & 20 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -258,14 +258,14 @@ impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRanges
/// will be a BED3 file.
///
/// # Arguments
/// * `output`: either `None` (for standard out) or file path. If the filepath
/// ends in `.gz`, the output will be gzip-compressed.
/// * `output`: either `None` (for standard out) or file path.
/// * `config`: a [`TsvConfig`], which contains the TSV output settings.
fn to_tsv(
&'a self,
output: Option<impl Into<PathBuf>>,
config: &TsvConfig,
) -> Result<(), GRangesError> {
// TODO gzip output handling
// output stream -- header is None for now (TODO)
let output = output.map_or(OutputStream::new_stdout(None), |file| {
OutputStream::new(file, None)
Expand Down Expand Up @@ -435,29 +435,26 @@ impl GRangesEmpty<VecRangesEmpty> {
) -> Result<GRangesEmpty<VecRangesEmpty>, GRangesError> {
let mut gr: GRangesEmpty<VecRangesEmpty> = GRangesEmpty::new_vec(seqlens);

// have we encountered a remainder chunk?

let mut remainder = false;
// iterate over each chromosome and create windows
for (seqname, len) in seqlens {
let mut start = 0;
while start < *len {
let mut end = start + width - 1;
let mut end = start + width;

if end >= *len {
// the end is past the sequence length
if chop {
// do not add any remainder
break;
} else {
end = std::cmp::min(end, len - 1);
}
}
if end - start + 1 < width {
if remainder {
break;
// truncate end, push, and break
end = std::cmp::min(end, *len);
gr.push_range(seqname, start, end)?;
}
remainder = true;
} else {
// push a normal window
gr.push_range(seqname, start, end)?;
}
gr.push_range(seqname, start, end + 1)?;
start += step.unwrap_or(width);
}
}
Expand Down Expand Up @@ -1386,31 +1383,34 @@ mod tests {

#[test]
fn test_from_windows() {
let sl = seqlens!( "chr1" => 35);
let sl = seqlens!( "chr1" => 35 );

// Test chop
let gr = GRangesEmpty::from_windows(&sl, 10, None, true).unwrap();
assert_eq!(gr.len(), 3, "{:?}", gr);
dbg!(&gr);
gr.iter_ranges().for_each(|r| assert_eq!(r.width(), 10));

// Test no chop
let gr = GRangesEmpty::from_windows(&sl, 10, None, false).unwrap();
assert_eq!(gr.iter_ranges().last().unwrap().width(), 5);

// Test sliding with step of 2, with chop
let sl = seqlens!( "chr1" => 21);
let sl = seqlens!( "chr1" => 21, "chr2" => 13 );
let gr = GRangesEmpty::from_windows(&sl, 10, Some(2), true).unwrap();
let mut expected_ranges_chop: Vec<(String, Position, Position)> = vec![
let expected_ranges_chop: Vec<(String, Position, Position)> = vec![
("chr1", 0, 10),
("chr1", 2, 12),
("chr1", 4, 14),
("chr1", 6, 16),
("chr1", 8, 18),
("chr1", 10, 20),
("chr2", 0, 10),
("chr2", 2, 12),
]
.into_iter()
.map(|(seq, s, e)| (seq.to_string(), s, e))
.collect();

let seqnames = sl.keys().map(|x| x.to_string()).collect::<Vec<_>>();
let actual_ranges: Vec<(String, Position, Position)> = gr
.iter_ranges()
Expand All @@ -1420,13 +1420,36 @@ mod tests {

// The same as above, without chop -- goes to seqlen with little remainder.
let gr = GRangesEmpty::from_windows(&sl, 10, Some(2), false).unwrap();
expected_ranges_chop.push(("chr1".to_string(), 12, 21));
let expected_ranges_no_chop: Vec<(String, Position, Position)> = vec![
("chr1", 0, 10),
("chr1", 2, 12),
("chr1", 4, 14),
("chr1", 6, 16),
("chr1", 8, 18),
("chr1", 10, 20),
("chr1", 12, 21),
("chr1", 14, 21),
("chr1", 16, 21),
("chr1", 18, 21),
("chr1", 20, 21),
("chr2", 0, 10),
("chr2", 2, 12),
("chr2", 4, 13),
("chr2", 6, 13),
("chr2", 8, 13),
("chr2", 10, 13),
("chr2", 12, 13),
]
.into_iter()
.map(|(seq, s, e)| (seq.to_string(), s, e))
.collect();

let actual_ranges: Vec<(String, Position, Position)> = gr
.iter_ranges()
.map(|r| (r.seqname(&seqnames), r.start(), r.end()))
.collect();

assert_eq!(actual_ranges, expected_ranges_chop);
assert_eq!(actual_ranges, expected_ranges_no_chop);
}

#[test]
Expand Down
4 changes: 4 additions & 0 deletions src/io/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,10 @@ impl TsvSerialize for Option<Strand> {
}

/// The additional two BED5 columns.
///
/// # Fields
/// * `name`: the feature name.
/// * `score`: a score.
#[derive(Clone, Debug)]
pub struct Bed5Addition {
pub name: String,
Expand Down
82 changes: 75 additions & 7 deletions src/main/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ use std::path::PathBuf;

use clap::{Parser, Subcommand};
use granges::{
commands::{granges_adjust, granges_filter, granges_flank, granges_map, ProcessingMode},
commands::{
granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, ProcessingMode,
},
data::operations::Operation,
prelude::GRangesError,
Position, PositionOffset,
Expand All @@ -17,12 +19,27 @@ usage: granges [--help] <subcommand>
Subcommands:
adjust: Adjust each genomic range, e.g. to add a kilobase to each end.
adjust: Adjust each genomic range, e.g. to add a kilobase to each end.
filter: Filter the left ranges based on whether they have at least one
overlap with a right range. This is equivalent to a filtering
"semi-join" in SQL terminology.
filter: Filter the left ranges based on whether they have at least one
overlap with a right range. This is equivalent to a filtering
"semi-join" in SQL terminology.
map: Compute the left grouped overlaps between the left genomic ranges
and right genomic ranges, and apply one or more operations to the
score column of the right BED5 file.
windows: Create a set of genomic windows of the specified width (in basepairs),
stepping the specified step size (the width, by default).
NOTE: granges is under active development. It is not currently meant to be
a full replacement for other genomic ranges software, such as bedtools. The
command line functionality currently used for testing and benchmarking.
Please request features and report any issues on GitHub:
https://github.com/vsbuffalo/granges/issues
"#;

#[derive(Parser)]
Expand All @@ -38,6 +55,8 @@ struct Cli {

#[derive(Subcommand)]
enum Commands {
/// Adjust the start, end, or both coordinates of each range by some
/// specified amount.
Adjust {
/// A TSV genome file of chromosome names and their lengths
#[arg(short, long, required = true)]
Expand All @@ -60,6 +79,8 @@ enum Commands {
sort: bool,
// TODO add skip_missing here
},
/// Filter out the left ranges that do not have overlaps with any
/// right ranges. This is a "semi-join" in SQL terminology.
Filter {
/// A TSV genome file of chromosome names and their lengths
#[arg(short, long, required = true)]
Expand All @@ -82,6 +103,7 @@ enum Commands {
#[arg(short, long)]
skip_missing: bool,
},
/// Compute the flanking regions for each range.
Flank {
/// A TSV genome file of chromosome names and their lengths
#[arg(short, long, required = true)]
Expand Down Expand Up @@ -116,6 +138,11 @@ enum Commands {
#[arg(long)]
in_mem: bool,
},
/// Do a "left grouped join", on the specified left and right genomic ranges,
/// and apply one or more functions to the BED5 scores for all right genomic
/// ranges.
///
/// This is analogous to 'bedtools map'.
Map {
/// A TSV genome file of chromosome names and their lengths
#[arg(short, long, required = true)]
Expand All @@ -142,6 +169,36 @@ enum Commands {
#[arg(short, long)]
skip_missing: bool,
},
/// Create a set of genomic windows ranges using the specified width
/// and step size, and output to BED3.
///
/// If --chop is set, the "remainder" windows at the end of a chromosome
/// that would have width less than that specified by --width are chopped
/// off.
///
/// This is analogous to 'bedtools makewindows'.
Windows {
/// A TSV genome file of chromosome names and their lengths
#[arg(short, long, required = true)]
genome: PathBuf,

/// Width (in basepairs) of each window.
#[arg(short, long)]
width: Position,

/// Step width (by default: window size).
#[arg(short, long)]
step: Option<Position>,

/// If last window remainder is shorter than width, remove?
#[arg(short, long)]
chop: bool,

/// An optional output file (standard output will be used if not specified)
#[arg(short, long)]
output: Option<PathBuf>,
},

#[cfg(feature = "dev-commands")]
RandomBed {
/// a TSV genome file of chromosome names and their lengths
Expand All @@ -159,6 +216,10 @@ enum Commands {
/// sort the ranges
#[arg(short, long)]
sort: bool,

/// add two additional columns (feature name and score) to mimic a BED5 file
#[arg(short, long)]
bed5: bool,
},
}

Expand Down Expand Up @@ -240,14 +301,21 @@ fn run() -> Result<(), GRangesError> {
*skip_missing,
)
}

Some(Commands::Windows {
genome,
width,
step,
chop,
output,
}) => granges_windows(genome, *width, *step, *chop, output.as_ref()),
#[cfg(feature = "dev-commands")]
Some(Commands::RandomBed {
genome,
num,
output,
sort,
}) => granges_random_bed(genome, *num, output.as_ref(), *sort),
bed5,
}) => granges_random_bed(genome, *num, output.as_ref(), *sort, *bed5),
None => {
println!("{}\n", INFO);
std::process::exit(1);
Expand Down
Loading

0 comments on commit 3e22670

Please sign in to comment.