Skip to content

Commit

Permalink
Fix chromosome naming issue in BAM file
Browse files Browse the repository at this point in the history
  • Loading branch information
NMNS93 committed Jan 8, 2024
1 parent 2ef6b81 commit 6b80eea
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 7 deletions.
27 changes: 21 additions & 6 deletions R/allele_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,6 @@ get_reads_in_segments <- function(bam_file, segments, min_mapq, paired_end = FAL
asMates <- FALSE
}

# Set segment chromosome names based on sequence style in BAM
# Required to read BAM correctly e.g. hg38 vs GRCh38 contigs
is_ucsc <- startsWith(seqnames(seqinfo(BamFile(bam_file)))[[1]], "chr")
bamstyle <- ifelse(is_ucsc, "UCSC", "NCBI")
GenomeInfoDb::seqlevelsStyle(segments) <- bamstyle

# Set parameters and read BamFile.
# ScanBamParam "whats" options are available via scanBamWhat()
param <- Rsamtools::ScanBamParam(which = segments, what = what, flag = flag, mapqFilter = min_mapq)
Expand All @@ -57,6 +51,7 @@ get_reads_in_segments <- function(bam_file, segments, min_mapq, paired_end = FAL
)

# Format BAM to ucsc format if not already
is_ucsc <- startsWith(seqnames(seqinfo(BamFile(bam_file)))[[1]], "chr")
if (!is_ucsc & nrow(bam_dt) > 0) {
bam_dt$rname <- paste0("chr", bam_dt$rname)
}
Expand Down Expand Up @@ -818,6 +813,19 @@ filter_multi_snp_loci <- function(pileup_summary) {
return(pileup_summary)
}

clean_and_limit_segments <- function(seg, bam_file) {
# Set segment chromosome names based on sequence style in BAM
# Required to read BAM correctly:
# e.g.: BAM having hg38 vs GRCh38 contigs
# e.g.: Segments having chromosomes not currently in BAM
bchroms <- seqnames(seqinfo(BamFile(bam_file)))
is_ucsc <- startsWith(bchroms[[1]], "chr")
bamstyle <- ifelse(is_ucsc, "UCSC", "NCBI")
GenomeInfoDb::seqlevelsStyle(seg) <- bamstyle
GenomeInfoDb::seqlevels(seg, pruning.mode="coarse") <- bchroms
return(seg)
}

# Wrapper ----

#' @export
Expand All @@ -831,6 +839,12 @@ cwrap_get_allele_counts <- function(bam_file, seg, loci_dt = NA, paired_end, dro
}
}

# Limit segments to chromosomes covered in BAM file
seg <- clean_and_limit_segments(seg, bam_file)
if(length(seg) == 0) {
return(empty_count_alleles_result())
}

# Read BAM and annotate SNP and CPG loci
bam_dt <- get_reads_in_segments(bam_file, seg, min_mapq, paired_end = paired_end)
if (nrow(bam_dt) == 0) {
Expand Down Expand Up @@ -883,3 +897,4 @@ cwrap_get_allele_counts <- function(bam_file, seg, loci_dt = NA, paired_end, dro

return(result)
}

3 changes: 2 additions & 1 deletion R/asm_allele_counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ cwrap_asm_get_allele_counts <- function(
bam_dt <- annotate_bam_with_loci_asm(bam_dt, loci_dt, drop_ccgg, paired_end)

# Get qname to cpg mapping
qname_hap_cg <- unique(bam_dt[, .(qname, hap_id, chrom, start, end)])
qname_hap_cg <- unique(bam_dt[, .(qname, hap_id, chrom=chrom, start=start, end=end)])
qname_hap_cg$chrom = gsub("chr", "", qname_hap_cg$chrom)

# Split by ref and alt after filtering
ref_bam <- bam_dt[hap_is_ref == T]
Expand Down

0 comments on commit 6b80eea

Please sign in to comment.