This repository has been archived by the owner on Dec 7, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Makefile
182 lines (153 loc) · 5.24 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
include config.mk
# LOCAL must be defined from command line if running analysis locally
# e.g: make collinearity LOCAL=yes
ifdef $(LOCAL)
LOCAL='$(LOCAL)';
endif
##########################################
#### 1. RADseq processing with STACKS ####
##########################################
# reference-based STACKS pipeline for LSF cluster (default)
ref_lsf:
make -f src/pipelines/Makeref.lsf
# reference based STACKS running locally (without LSF)
.PHONY : local
ref_local:
make -f src/pipelines/Makeref.local
##############################
#### 2. PLOIDY SEPARATION ####
##############################
# This rule is used to split haploid and diploid males.
# This has already been done with the dataset provided
# under stringent parameters (D=25 in STACKS populations)
.PHONY : ploidy
ploidy:
# Parsing VCF file from populations output
mkdir -p $(DAT)/ploidy/thresholds
bash $(MISC)/parse_VCF.sh $(POP) $(DAT)/ploidy/vcftools
# Building list of haploid males
python2 src/ploidy/haplo_males.py $(VCFSUM) \
$(THRESH) \
--sample_list $(DAT)/individuals.tsv \
--ploidy_thresh $(HOM_PLOID)
# Processing populations genomic output (excluding hom/missing SNPs in
# mothers from their families)
python2 $(ASSOC-SRC)/process_genomic.py $(POP) \
$(DAT)/ploidy/
# Blacklisting loci that are heterozygous in haploid males
python2 src/ploidy/blacklist_haploloci.py $(DAT)/ploidy/ \
$(BLACK) \
$(THRESH)
mkdir -p data/SNP_lists
################################
#### 3. ASSOCIATION MAPPING ####
################################
# Association mapping
.PHONY : assoc_mapping
assoc_mapping : $(CENTRO) $(SIZES) $(ASSOC)/grouped_prophom.tsv
# Creating folder to store association mapping results
mkdir -p $(ASSOC)/plots
mkdir -p $(ASSOC)/hits
# Genome-wide association mapping
mkdir -p $(ASSOC)/case_control
Rscript $(ASSOC-SRC)/case_control.R \
$(ASSOC)/grouped_outpool_keep_prophom.tsv \
$(ASSOC)/case_control/ \
$(SIZES)
# Centromere identification
$(CENTRO) :
mkdir -p $(CENTRO)/plots
# Processing "genomic" output from populations to get fixed and variant sites
python2 $(ASSOC-SRC)/process_genomic.py $(POP) $(ASSOC) \
--pool_output
# Inferring centromere position based on recombination raes along chromosomes
Rscript $(ASSOC-SRC)/chrom_types.R $(ASSOC)/grouped_outpool_prophom.tsv \
$(DAT)/individuals.tsv \
$(CENTRO)
# Processing STACKS "genomic" output for association mapping
$(ASSOC)/grouped_prophom.tsv :
python2 $(ASSOC-SRC)/process_genomic.py $(POP) $(ASSOC) \
--keep_all --pool_output
# Store sizes of all contigs in text file
$(SIZES):
awk '$$0 ~ ">" {print c; c=0;printf substr($$0,2,100) "\t"; } \
$$0 !~ ">" {c+=length($$0);} END { print c; }' $(REF) > $@
##################################
#### 4. COLLINEARITY ANALYSIS ####
##################################
# Preparing input file for collinearity analysis. Run MCScanX on files manually
.PHONY : collinearity
collinearity : $(RNA)/assembled/ $(CORRESP)
rm -rf $(MCSX-IN)/
mkdir -p $(MCSX-IN)/
bash $(MCSX-SRC) -g $(RNA)/assembled/transcripts.gtf \
-o $(MCSX-IN) \
-r $(REF) \
-c $(CORRESP) \
$${LOCAL:+-l}
bash src/circos_conf/circos_input_gen.sh $(REF)
# Assembling transcripts and measuring coverage along genome
$(RNA)/assembled/ :
bash $(RNA-SRC) -a $(BAM) \
-r $(OLD-REF) \
-o $@ \
$${LOCAL:+-l}
# Contig correspondance file between old (unanchored)
# and new (anchored) assembly
$(CORRESP):
bash src/convert_coord/corresp_contigs.sh -O $(OLD-REF) \
-g $(LOG) \
-N $(REF) \
-c $(CORRESP) \
$${LOCAL:+-l} \
2> $(LOG)/corresp.log
#########################################
#### 5. ANALYSIS OF WILD WGS SAMPLES ####
#########################################
$(WGS)/mapped : $(WGS)/raw
bash src/wgs_wild/bwa_wgs.sh -d $(WGS) \
-r $(REF) \
$${LOCAL:+-l}
$(WGS)/variant/hap.wild.matrix : $(WGS)/mapped
bash src/wgs_wild/snps_wgs.sh -d $(WGS) \
-r $(REF) \
$${LOCAL:+-l}
.PHONY : wgs_qc
wgs_qc : $(WGS)/mapped $(WGS)/raw
bash src/wgs_wild/qc_gen.sh -d $(WGS) \
-r $(REF) \
$${LOCAL:+-l}
.PHONY : wgs_wild
wgs_wild : $(CORRESP) $(SIZES) $(WGS)/variant/hap.wild.matrix
# Compute PI diversity in sliding windows
Rscript src/wgs_wild/compute_PI.R \
--in $(WGS)/variant/hap.wild.matrix.txt \
--out $(WGS)/stats/win_w100_t10_PI.tsv \
--mode 'window' \
--step_size 10 \
--win_size 100
# Compute PI diversity per site
Rscript src/wgs_wild/compute_PI.R \
--in $(WGS)/variant/hap.wild.matrix.txt \
--out $(WGS)/stats/sites_PI.tsv \
--mode 'site'
####################
#### MISC RULES ####
####################
# Transform basepair positions for SNPs from the association mapping into
# centiMorgan.
.PHONY: bp2cm
bp2cm:
# Compute coordinates of linkage map markers in new genome
# Compute cM/BP ratio in each inter-marker interval
bash src/convert_coord/bp2cm.sh -r $(OLD-REF) -n $(REF) -o $(LINKMAP)/bp2cm/bp2cm \
-m $(LINKMAP)/linkage_markers.csv
# Apply transformation to CSD dataset to get cM coordinates
bash src/convert_coord/apply_bp2cm.sh $(ASSOC)/case_control/case_control_all.tsv \
$(LINKMAP)/bp2cm/bp2cm.tsv \
$(LINKMAP)/bp2cm/bp2cm_csd_snps.tsv
# Check genome completeness using BUSCO
.PHONY : busco
busco :
bash src/misc/genome_completeness.sh -r $(REF) \
-d data/ref_genome/busco/