Skip to content
This repository has been archived by the owner on Dec 7, 2021. It is now read-only.

Commit

Permalink
make bash code safer by quoting expansions and enforcing locales. Add…
Browse files Browse the repository at this point in the history
… utility script for coverage per sample.
  • Loading branch information
cmdoret committed Aug 2, 2019
1 parent d487439 commit bbbe4d9
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 23 deletions.
29 changes: 15 additions & 14 deletions src/convert_coord/apply_bp2cm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
# Transforms basepairs into cM in the case_control_all.tsv file
# from the association mapping using the output of bp2cm.sh
# Assumes both files are sorted by ascending Chr, BP
# TODO: distance between pairs of SNPs needs to be ratio*D
# Where D is the distance between a SNP and end of interval.
# If SNPs on different intervals, compute D per interval and sum them
# TODO: increment cumulative sum after every pair of SNP to get actual cM unit.

# shellcheck disable=SC1091
source src/misc/jobs_manager.sh
in_hits="$1"
bp2cm="$2"
out_hits="$3"

# Enforce US locales to keep dot as decimal separator
LC_NUMERIC=en_US.UTF-8

# Sort hits by position
hits="$in_hits.sorted"
head -n 1 "$in_hits" > "$hits"
Expand All @@ -20,36 +21,36 @@ sort -k2,2 -k3,3n <(tail -n+2 "$in_hits") >> "$hits"
cum_cM=0
#Start at line 2 to skip header
n_interv=1
echo -e "$(head -n 1 "$hits")\tcM" > $out_hits
echo -e "$(head -n 1 "$hits")\tcM" > "$out_hits"

prev_chr=$(head -n 1 "$hits" | cut -f2)
# Loop over association mapping hits
while read -a hit; do
while read -ra hit; do
((i++))
#prettyload "$i" $(wc -l "$hits" | awk '{print $1}')
chr=${hit[1]}
bp=${hit[2]}

# Safety to reset the cM count on new chrom
# works even if hit and interval changed chrom at the same time
if [ $chr != $prev_chr ];then
if [ "$chr" != "$prev_chr" ];then
cum_cM=0
fi
# Loop over linkage intervals
# Stop reading if interval starts after hit on same chrom
while read -a interv && ( ( [ ${interv[0]} == $chr ] && \
[ ${interv[1]} -le $bp ] ) || \
[ ${interv[0]} \< $chr ] ); do
while read -a interv && ( ( [ "${interv[0]}" == "$chr" ] && \
[ "${interv[1]}" -le "$bp" ] ) || \
[ "${interv[0]}" \< "$chr" ] ); do

# If on different chrom, shift interval and reset cM value
if [ ${interv[0]} \< $chr ];then
if [ "${interv[0]}" \< "$chr" ];then
cum_cM=0
((n_interv++))
continue
fi

# If hit is inside segment (i.e. before its end)
if [ $bp -le ${interv[2]} ]; then
if [ "$bp" -le "${interv[2]}" ]; then
# Position of SNP from start of interval
from_start=$((bp - interv[1]))
curr_cM=$(echo "$from_start * ${interv[3]} + $cum_cM" | bc -l)
Expand All @@ -64,10 +65,10 @@ while read -a hit; do
echo "$cum_cM"

# Only check intervals from n_inter to the end
done < <(sed -n "${n_interv},\$p" $bp2cm)
done < <(sed -n "${n_interv},\$p" "$bp2cm")

prev_chr=$chr
done < <( tail -n+2 $hits)
done < <( tail -n+2 "$hits")

rm "$hits"
sed 's/ / /g' "$out_hits" > "$out_hits.tmp" && mv "$out_hits.tmp" "$out_hits"
21 changes: 12 additions & 9 deletions src/convert_coord/bp2cm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
usage(){

cat <<USAGE
`basename $0` [OPTION]...
$(basename "$0") [OPTION]...
Compute cM/BP ratio between all markers in a linkage map and convert coordinates to an anchored assembly.
-n, --new path to the new (anchored) assembly fasta file.
-r, --old_ref path to the old (non-anchored) assembly fasta file.
Expand All @@ -33,14 +33,17 @@ case $key in
esac
done

# Enforce english numerals in script to keep point decimal separator
LC_NUMERIC=en_US.UTF-8

# Check if any argument is missing
if [ -z "$NREF" ]; then miss="New assembly"; fi
if [ -z "$OREF" ]; then miss="Old assembly"; fi
if [ -z "$MARK" ]; then miss="Markers list"; fi
if [ -z "$OUTF" ]; then miss="Output path"; fi

# Signal missing argument, print help and exit
if [ ! -z ${miss+x} ];then echo -e "\033[31;1m Error: \033[m $miss not provided."; usage; fi
if [ -n "${miss+x}" ];then echo -e "\033[31;1m Error: \033[m $miss not provided."; usage; fi

mkdir -p "$(dirname $OUTF)"
# Compute coordinates of markers in new assembly and store correspondance list to file
Expand All @@ -51,7 +54,7 @@ if [ -z ${PRES+x} ] || [ ! -f "$OUTF.corresp" ]; then
# remove useless cols | make composite chr-bp field |
# join on composite field to add cM info
python2 src/convert_coord/contig2chr.py "$OREF" "$NREF" \
--region_size 5000 --include_input < $MARK \
--region_size 5000 --include_input < "$MARK" \
| cut -d ',' -f1-4 \
| awk -v FS=',' -v OFS=',' '{print $1"-"$2,$0}' \
| sort -t ',' -k1,1 \
Expand Down Expand Up @@ -85,11 +88,11 @@ awk 'BEGIN{FS="\t";OFS="\t"}

prev=(0 0 0 0 0)
rm -f "$OUTF.tsv"
while read -a marker; do
while read -ra marker; do
# If still on same contig, compute bp/cM ratio between previous and current marker
if [ ${prev[0]} == ${marker[0]} ] || [ ${prev[0]} == 0 ]; then
if [ "${prev[0]}" == "${marker[0]}" ] || [ "${prev[0]}" == 0 ]; then
cM=$(echo "${marker[2]} - ${prev[2]}" | bc -l)
bp=$(( ${marker[4]} - ${prev[4]} ))
bp=$(( "${marker[4]}" - "${prev[4]}" ))
#echo "${marker[@]}"
ratio=$(echo "$cM / $bp" | bc -l | xargs printf "%.10f\n")
# Ignore SNP if BP order is wrong (i.e. diminishing cM) or unreasonably high ratio.
Expand All @@ -101,7 +104,7 @@ while read -a marker; do

else
# if still on same chromosome, use mean chrom. ratio to estimate inter-contig ratio
if [ ${marker[3]} == ${prev[3]} ]; then
if [ "${marker[3]}" == "${prev[3]}" ]; then
ratio=$(grep "${marker[3]}" "$OUTF.cMean" | cut -f2)
else
# New chromosome. No need to subtract (previous is 0)
Expand All @@ -110,8 +113,8 @@ while read -a marker; do
fi
fi
# send chromosome, interval between previous and current markers in BP, and cM/BP ratio in interval
echo -e "${marker[3]}\t$((${prev[4]}+1))\t${marker[4]}\t$ratio" >> "$OUTF.tsv"
prev=(${marker[@]})
echo -e "${marker[3]}\t$((prev[4]+1))\t${marker[4]}\t$ratio" >> "$OUTF.tsv"
prev=("${marker[@]}")

# TODO: Eliminate markers higher than a threshold (cM/BP > 1 maybe ?)
done < "$OUTF.corresp"
22 changes: 22 additions & 0 deletions src/misc/cov_per_sample.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

GENOMESIZE=140734580
READLENGTH=93
REPORT="sample_coverage.tsv"

echo -e "sample\tcoverage" > "$REPORT"
n_samples=$(ls -1 data/mapped/aln-4/bam/*bam | wc -l)
declare -i n_done=0
for sample_bam in data/mapped/aln-4/bam/*bam; do
echo -e "Processed $n_done / $n_samples"
COVERAGE=$(samtools depth -a "$sample_bam" \
| awk -v gsize="$GENOMESIZE" -v rlen="$READLENGTH" '{depth+=$3}END{print rlen*depth/gsize}')
sample_name="$(basename $sample_bam)"
echo -ne "${sample_name%%-BWA*}\t" >> "$REPORT"
echo "$COVERAGE" >> "$REPORT"
((n_done++))
done
# Merge coverage file with sample metadata
join -j 1 \
<( tail -n+2 sample_coverage.tsv | sort -k1,1) \
<( tail -n +2 data/ploidy/thresholds/fixed.tsv | sort -k1,1)

0 comments on commit bbbe4d9

Please sign in to comment.