-
Notifications
You must be signed in to change notification settings - Fork 0
/
cdsfile2rbh.R
659 lines (658 loc) · 27.9 KB
/
cdsfile2rbh.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
#' @title cdsfile2rbh
#' @name cdsfile2rbh
#' @description This function calculates (conditional-)reciprocal best hit
#' (CRBHit) pairs from two CDS fasta files.
#' CRBHit pairs were introduced by \emph{Aubry S, Kelly S et al. (2014)}.
#' Sequence searches are performed with \bold{last}
#' \emph{Kiełbasa, SM et al. (2011)} [default]
#' or with \bold{mmseqs2}
#' \emph{Steinegger, M and Soeding, J (2017)}
#' or with \bold{diamond}
#' \emph{Buchfink, B et al. (2021)}.
#' If one specifies cdsfile1 and cdsfile2 as the same input a selfblast is
#' conducted.
#' @param cdsfile1 cds1 fasta file [mandatory]
#' @param cdsfile2 cds2 fasta file [mandatory]
#' @param dbfile1 aa1 db file [optional]
#' @param dbfile2 aa2 db file [optional]
#' @param searchtool specify sequence search algorithm last, mmseqs2, diamond or
#' lambda3
#' [default: last]
#' @param lastpath specify the PATH to the last binaries
#' [default: /extdata/last-1595/bin/]
#' @param lastD last option D: query letters per random alignment
#' [default: 1e6]
#' @param lastm last option m: maximum initial matches per query position
#' [default: 10]
#' @param mmseqs2path specify the PATH to the mmseqs2 binaries
#' [default: NULL]
#' @param mmseqs2sensitivity specify the sensitivity option of mmseqs2
#' [default: 5.7]
#' @param mmseqs2maxseqs mmseqs2 option: Maximum results per query sequence
#' allowed to pass the prefilter
#' [default: 300]
#' @param diamondpath specify the PATH to the diamond binaries
#' [default: NULL]
#' @param diamondsensitivity specify the sensitivity option of diamond
#' [default: --sensitive]
#' @param diamondmaxtargetseqs specify the maximum number of target sequences
#' per query option of diamond
#' [default: 0]
#' @param lambda3path specify the PATH to the lambda3 binaries
#' [default: NULL]
#' @param lambda3sensitivity specify the sensitivity option of lambda3
#' [default: sensitive]
#' @param lambda3nummatches specify the number of matches per query option of
#' lambda3
#' [default: 25]
#' @param outpath specify the output PATH [default: /tmp]
#' @param crbh specify if conditional-reciprocal hit pairs should be retained
#' as secondary hits [default: TRUE]
#' @param keepSingleDirection specify if single direction secondary hit pairs
#' should be retained [default: FALSE]
#' @param eval evalue [default: 1e-3]
#' @param qcov query coverage [default: 0.0]
#' @param tcov target coverage [default: 0.0]
#' @param pident percent identity [default: 0.0]
#' @param alnlen alignment length [default: 0.0]
#' @param rost1999 specify if hit pairs should be filter by equation 2 of
#' Rost 1999 [default: FALSE]
#' @param filter specify additional custom filters as list to be applied on
#' hit pairs [default: NULL]
#' @param plotCurve specify if crbh fitting curve should be plotted
#' [default: FALSE]
#' @param fit.type specify if mean or median should be used for fitting
#' [default: mean]
#' @param fit.varweight factor for fitting function to consider neighborhood
#' [default: 0.1]
#' @param fit.min specify minimum neighborhood alignment length [default: 5]
#' @param longest.isoform specify if cds sequences should be removed to the
#' longest isoform (only possible if data was accessed from NCBI or ENSEMBL)
#' [default: FALSE]
#' @param isoform.source specify cds sequences source (either NCBI or ENSEMBL)
#' [default: NCBI]
#' @param threads number of parallel threads [default: 1]
#' @param remove specify if last result files should be removed [default: TRUE]
#' @param remove.db specify if last db files should be removed [default: TRUE]
#' @param shorten1 shorten all sequences to multiple of three [default: FALSE]
#' @param frame1 indicates the first base of the first codon [default: 1]
#' @param framelist1 supply vector of frames for each entry [default: NULL]
#' @param genetic.code1 The genetic code to use for the translation of codons
#' into Amino Acid letters [default: NULL]
#' @param shorten2 shorten all sequences to multiple of three [default: FALSE]
#' @param frame2 indicates the first base of the first codon [default: 1]
#' @param framelist2 supply vector of frames for each entry [default: NULL]
#' @param genetic.code2 The genetic code to use for the translation of codons
#' into Amino Acid letters [default: NULL]
#' @return List of three (crbh=FALSE)\cr
#' 1: $crbh.pairs\cr
#' 2: $crbh1 matrix; query > target\cr
#' 3: $crbh2 matrix; target > query\cr
#' \cr
#' List of four (crbh=TRUE)\cr
#' 1: $crbh.pairs\cr
#' 2: $crbh1 matrix; query > target\cr
#' 3: $crbh2 matrix; target > query\cr
#' 4: $rbh1_rbh2_fit; evalue fitting function
#' @importFrom Biostrings DNAString DNAStringSet AAString AAStringSet
#' readDNAStringSet readAAStringSet writeXStringSet width subseq
#' @importFrom graphics legend par points
#' @importFrom stats median splinefun
#' @importFrom utils read.table
#' @importFrom tidyr %>%
#' @importFrom MSA2dist cds2aa
#' @seealso \code{\link[CRBHits]{cds2rbh}},
#' \code{\link[CRBHits]{isoform2longest}}
#' @references Aubry S, Kelly S et al. (2014) Deep Evolutionary Comparison of
#' Gene Expression Identifies Parallel Recruitment of Trans-Factors in Two
#' Independent Origins of C4 Photosynthesis. \emph{PLOS Genetics},
#' \bold{10(6)} e1004365.
#' @references Kiełbasa, SM et al. (2011) Adaptive seeds tame genomic sequence
#' comparison. \emph{Genome research}, \bold{21(3)}, 487-493.
#' @references Steinegger, M and Soeding, J. (2017). MMseqs2 enables sensitive
#' protein sequence searching for the analysis of massive data sets.
#' \emph{Nature Biotechnology}, \bold{35}, 1026-1028.
#' @references Rost B. (1999). Twilight zone of protein sequence alignments.
#' \emph{Protein Engineering}, \bold{12(2)}, 85-94.
#' @examples
#' ## compile last-1595 within CRBHits
#' CRBHits::make_last()
#' ## load example sequence data
#' athfile <- system.file("fasta", "ath.cds.fasta.gz", package="CRBHits")
#' alyfile <- system.file("fasta", "aly.cds.fasta.gz", package="CRBHits")
#' ## get CRBHit pairs
#' ath_aly_crbh <- cdsfile2rbh(
#' cdsfile1=athfile,
#' cdsfile2=alyfile,
#' plotCurve=TRUE)
#' dim(ath_aly_crbh$crbh.pairs)
#' ## get classical reciprocal best hit (RBHit) pairs
#' ath_aly_rbh <- cdsfile2rbh(
#' cdsfile1=athfile,
#' cdsfile2=alyfile,
#' crbh=FALSE)
#' dim(ath_aly_rbh$crbh.pairs)
#' ## selfblast
#' ath_selfblast_crbh <- cdsfile2rbh(
#' cdsfile1=athfile,
#' cdsfile2=athfile,
#' plotCurve=TRUE)
#' ## see ?cds2rbh for more examples
#' @export cdsfile2rbh
#' @author Kristian K Ullrich
cdsfile2rbh <- function(cdsfile1, cdsfile2,
dbfile1=NULL,
dbfile2=NULL,
searchtool="last",
lastpath=paste0(find.package("CRBHits"),
"/extdata/last-1595/bin/"),
lastD=1e6,
lastm=10,
mmseqs2path=NULL,
mmseqs2sensitivity=5.7,
mmseqs2maxseqs=300,
diamondpath=NULL,
diamondsensitivity="--sensitive",
diamondmaxtargetseqs=0,
lambda3path=NULL,
lambda3sensitivity="sensitive",
lambda3nummatches=25,
outpath="/tmp",
crbh=TRUE,
keepSingleDirection=FALSE,
eval=1e-3,
qcov=0.0,
tcov=0.0,
pident=0.0,
alnlen=0.0,
rost1999=FALSE,
filter=NULL,
plotCurve=FALSE,
fit.type="mean",
fit.varweight=0.1,
fit.min=5,
longest.isoform=FALSE,
isoform.source="NCBI",
threads=1,
remove=TRUE,
remove.db=TRUE,
shorten1=FALSE,
frame1=1,
framelist1=NULL,
genetic.code1=NULL,
shorten2=FALSE,
frame2=1,
framelist2=NULL,
genetic.code2=NULL
){
#internal function to fit evalue by length
fitSpline <- function(alnlength, evalue, fit.type, fit.varweight, fit.min){
log10evalue <- -log10(evalue)
log10evalue[is.infinite(log10evalue)] <- 324
x <- cbind(alnlength, log10evalue)
x <- x[order(x[,1]),]
x.max <- max(x[,1])
fitMatrix <- matrix(0, ncol=2, nrow=x.max)
for(i in seq(from=1, to=x.max)){
fitMatrix[i, 1] <- i
s <- round(i * fit.varweight)
if(s<fit.min){s <- fit.min}
smin <- i - s
smax <- i + s
s.idx <- which(x[,1]>=smin & x[,1]<=smax)
if(length(s.idx)==0){s.value <- 0}
if(length(s.idx)!=0){
if(fit.type=="mean"){
s.value <- mean(x[s.idx, 2])
}
if(fit.type=="median"){
s.value <- median(x[s.idx, 2])
}
}
if(i==1){
fitMatrix[i, 2] <- s.value
}
if(i>1){
if(fitMatrix[i-1, 2]<=s.value){
fitMatrix[i, 2] <- s.value
}
if(fitMatrix[i-1, 2]>s.value){
fitMatrix[i, 2] <- fitMatrix[i-1, 2]
}
}
}
fitMatrixfun <- splinefun(fitMatrix[,1], fitMatrix[,2])
return(fitMatrixfun)
}
if(searchtool=="last"){
if(!dir.exists(lastpath)){
stop("Error: last PATH does not exist. Please specify correct
PATH and/or look into package installation prerequisites.
Try to use make_last() function.")
}
if(!file.exists(paste0(lastpath, "lastdb"))){
stop("Error: lastdb binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites. Try to use make_last() function.")
}
if(!file.exists(paste0(lastpath, "lastal"))){
stop("Error: lastal binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites. Try to use make_last() function.")
}
}
if(searchtool=="mmseqs2"){
if(!dir.exists(mmseqs2path)){
stop("Error: mmseqs2 PATH does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
if(!file.exists(paste0(mmseqs2path, "mmseqs"))){
stop("Error: mmseqs2 binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
}
if(searchtool=="diamond"){
if(!dir.exists(diamondpath)){
stop("Error: diamond PATH does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
if(!file.exists(paste0(diamondpath, "diamond"))){
stop("Error: diamond binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
}
if(searchtool=="lambda3"){
if(!dir.exists(lambda3path)){
stop("Error: lambda3 PATH does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
if(!file.exists(paste0(lambda3path, "lambda3"))){
stop("Error: lambda3 binary does not exist. Please specify
correct PATH and/or look into package installation
prerequisites.")
}
}
selfblast <- FALSE
if(cdsfile1==cdsfile2){
selfblast <- TRUE
}
aa1file <- tempfile("aa1_", outpath)
aa2file <- tempfile("aa2_", outpath)
aa1dbfile <- tempfile("aa1db_", outpath)
aa2dbfile <- tempfile("aa2db_", outpath)
if(!is.null(dbfile1)){
aa1dbfile <- dbfile1
}
if(!is.null(dbfile2)){
aa2dbfile <- dbfile2
}
if(searchtool=="last"){
aa2_aa1_lastout <- tempfile("aa2_aa1_lastout_", outpath)
aa1_aa2_lastout <- tempfile("aa1_aa2_lastout_", outpath)
}
if(searchtool=="mmseqs2"){
aa2_aa1_lastout <- tempfile("aa2_aa1_mmseqs2_", outpath)
aa1_aa2_lastout <- tempfile("aa1_aa2_mmseqs2_", outpath)
}
if(searchtool=="diamond"){
aa2_aa1_lastout <- tempfile("aa2_aa1_diamond_", outpath)
aa1_aa2_lastout <- tempfile("aa1_aa2_diamond_", outpath)
}
if(searchtool=="lambda3"){
aa1file <- tempfile("aa1_", outpath, fileext = ".fa")
aa2file <- tempfile("aa2_", outpath, fileext = ".fa")
aa1dbfile <- tempfile("aa1db_", outpath, fileext = ".lba")
aa2dbfile <- tempfile("aa2db_", outpath, fileext = ".lba")
aa2_aa1_lastout <- tempfile("aa2_aa1_lambda3_", outpath,
fileext = ".m8")
aa1_aa2_lastout <- tempfile("aa1_aa2_lambda3_", outpath,
fileext = ".m8")
}
if(longest.isoform){
cds1 <- Biostrings::readDNAStringSet(cdsfile1)
cds2 <- Biostrings::readDNAStringSet(cdsfile2)
Biostrings::writeXStringSet(MSA2dist::cds2aa(isoform2longest(cds1,
isoform.source), shorten=shorten1, frame=frame1,
framelist=framelist1, genetic.code=genetic.code1), file=aa1file)
Biostrings::writeXStringSet(MSA2dist::cds2aa(isoform2longest(cds2,
isoform.source), shorten=shorten2, frame=frame2,
framelist=framelist2, genetic.code=genetic.code2), file=aa2file)
}
if(!longest.isoform){
cdsfile2aafile(cdsfile1, aa1file,
shorten=shorten1, frame=frame1, framelist=framelist1,
genetic.code=genetic.code1)
cdsfile2aafile(cdsfile2, aa2file,
shorten=shorten2, frame=frame2, framelist=framelist2,
genetic.code=genetic.code2)
}
if(searchtool=="last"){
system2(command=paste0(lastpath, "lastdb"),
args = c("-p", "-cR01", "-P", threads, aa1dbfile, aa1file))
system2(command=paste0(lastpath, "lastdb"),
args = c("-p", "-cR01", "-P", threads, aa2dbfile, aa2file))
system2(command=paste0(lastpath, "lastal"),
args = c("-f", "BlastTab+", "-P", threads, "-D", lastD, "-m", lastm,
aa1dbfile, aa2file, ">", aa2_aa1_lastout))
system2(command=paste0(lastpath, "lastal"),
args = c("-f", "BlastTab+", "-P", threads, "-D", lastD, "-m", lastm,
aa2dbfile, aa1file, ">", aa1_aa2_lastout))
}
if(searchtool=="mmseqs2"){
system2(command=paste0(mmseqs2path, "mmseqs"),
args = c("easy-search", aa1file, aa2file, aa1_aa2_lastout, outpath,
"--threads", threads, "-s", mmseqs2sensitivity,
"--max-seqs", mmseqs2maxseqs,
"--format-output", paste0("query,target,fident,alnlen,",
"mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,",
"tlen,raw")))
system2(command=paste0(mmseqs2path, "mmseqs"),
args = c("easy-search", aa2file, aa1file, aa2_aa1_lastout, outpath,
"--threads", threads, "-s", mmseqs2sensitivity,
"--max-seqs", mmseqs2maxseqs,
"--format-output", paste0("query,target,fident,alnlen,",
"mismatch,gapopen,qstart,qend,tstart,tend,evalue,bits,qlen,",
"tlen,raw")))
}
if(searchtool=="diamond"){
system2(command=paste0(diamondpath, "diamond"),
args = c("makedb", "--in", aa1file,
"-d", aa1dbfile))
system2(command=paste0(diamondpath, "diamond"),
args = c("makedb", "--in", aa2file,
"-d", aa2dbfile))
system2(command=paste0(diamondpath, "diamond"),
args = c("blastp", "--ignore-warnings", "-d", aa2dbfile,
"-q", aa1file, "-o", aa1_aa2_lastout, diamondsensitivity,
"--max-target-seqs", diamondmaxtargetseqs,
"-f", "6", "qseqid", "sseqid", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send", "evalue",
"bitscore", "qlen", "slen", "score", "--threads", threads))
system2(command=paste0(diamondpath, "diamond"),
args = c("blastp", "--ignore-warnings", "-d", aa1dbfile,
"-q", aa2file, "-o", aa2_aa1_lastout, diamondsensitivity,
"--max-target-seqs", diamondmaxtargetseqs,
"-f", "6", "qseqid", "sseqid", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send", "evalue",
"bitscore", "qlen", "slen", "score", "--threads", threads))
}
if(searchtool=="lambda3"){
system2(command=paste0(lambda3path, "lambda3"),
args = c("mkindexp", "-d", aa1file, "-i", aa1dbfile,
"--threads", threads))
system2(command=paste0(lambda3path, "lambda3"),
args = c("mkindexp", "-d", aa2file, "-i", aa2dbfile,
"--threads", threads))
system2(command=paste0(lambda3path, "lambda3"),
args = c("searchp", "-i", aa2dbfile, "-q", aa1file,
"-o", aa1_aa2_lastout, "-p", lambda3sensitivity,
"--num-matches", lambda3nummatches, "--output-columns",
"'std qlen slen score'", "--threads", threads))
system2(command=paste0(lambda3path, "lambda3"),
args = c("searchp", "-i", aa1dbfile, "-q", aa2file,
"-o", aa2_aa1_lastout, "-p", lambda3sensitivity,
"--num-matches", lambda3nummatches, "--output-columns",
"'std qlen slen score'", "--threads", threads))
}
aa1_aa2 <- read.table(aa1_aa2_lastout, sep="\t", header=FALSE,
stringsAsFactors=FALSE)
aa2_aa1 <- read.table(aa2_aa1_lastout, sep="\t", header=FALSE,
stringsAsFactors=FALSE)
colnames(aa1_aa2) <- colnames(aa2_aa1) <- c("query_id", "subject_id",
"perc_identity", "alignment_length", "mismatches", "gap_opens",
"q_start", "q_end", "s_start", "s_end", "evalue", "bit_score",
"query_length", "subject_length", "raw_score")
if(searchtool=="mmseqs2"){
aa1_aa2[, "perc_identity"] <- aa1_aa2[, "perc_identity"] * 100
aa2_aa1[, "perc_identity"] <- aa2_aa1[, "perc_identity"] * 100
}
if(remove){
system2(command="rm", args = aa1file)
system2(command="rm", args = aa2file)
system2(command="rm", args = aa2_aa1_lastout)
system2(command="rm", args = aa1_aa2_lastout)
}
if(remove){
system2(command="rm", args = paste0(aa1dbfile, "*"))
system2(command="rm", args = paste0(aa2dbfile, "*"))
}
#selfblast
if(selfblast){
aa1_aa2 <- aa1_aa2[aa1_aa2[,1] != aa1_aa2[,2], , drop=FALSE]
aa2_aa1 <- aa2_aa1[aa2_aa1[,1] != aa2_aa1[,2], , drop=FALSE]
if(dim(aa1_aa2)[1]==0 & dim(aa2_aa1)[1]==0){
stop("No recirpocal best hits!")
}
}
#apply standard filters on hit pairs
aa1_aa2 <- aa1_aa2 %>% filter_eval(eval) %>% filter_qcov(qcov) %>%
filter_tcov(tcov) %>% filter_pident(pident) %>% filter_alnlen(alnlen)
aa2_aa1 <- aa2_aa1 %>% filter_eval(eval) %>% filter_qcov(qcov) %>%
filter_tcov(tcov) %>% filter_pident(pident) %>% filter_alnlen(alnlen)
if(rost1999){
aa1_aa2 <- aa1_aa2 %>% filter_rost1999
aa2_aa1 <- aa2_aa1 %>% filter_rost1999
}
#apply additional filters on hit pairs
for(f_ in filter){
aa1_aa2 <- aa1_aa2 %>% f_
aa2_aa1 <- aa2_aa1 %>% f_
}
if(dim(aa1_aa2)[1]==0 & dim(aa2_aa1)[1]==0){
stop("No recirpocal best hits!")
}
aa1_aa2.idx <- paste0(aa1_aa2[, 1], "\t" , aa1_aa2[, 2])
aa2_aa1.idx <- paste0(aa2_aa1[, 2], "\t" , aa2_aa1[, 1])
#deduplicate hit pairs and only retain the best hit per query
aa1_aa2.dedup <- aa1_aa2[!duplicated(aa1_aa2[, 1]), , drop=FALSE]
aa2_aa1.dedup <- aa2_aa1[!duplicated(aa2_aa1[, 1]), , drop=FALSE]
aa1_aa2.dedup.idx <- paste0(aa1_aa2.dedup[, 1], "\t" , aa1_aa2.dedup[, 2])
aa2_aa1.dedup.idx <- paste0(aa2_aa1.dedup[, 2], "\t" , aa2_aa1.dedup[, 1])
#reduce to reciprocal best hits
rbh1 <- aa1_aa2.dedup[which(aa1_aa2.dedup.idx %in% aa2_aa1.dedup.idx), ,
drop=FALSE]
rbh2 <- aa2_aa1.dedup[which(aa2_aa1.dedup.idx %in% aa1_aa2.dedup.idx), ,
drop=FALSE]
if(selfblast){
rbh1 <- rbh1[!duplicated(apply(rbh1[, seq_len(2)], 1, function(x)
paste0(sort(x), collapse="\t"))), , drop=FALSE]
rbh2 <- rbh2[!duplicated(apply(rbh2[, seq_len(2)], 1, function(x)
paste0(sort(x), collapse="\t"))), , drop=FALSE]
}
#if no crbh - done
if(!crbh){
rbh <- rbh1[, seq_len(2)]
rbh <- cbind(rbh, "rbh")
colnames(rbh) <- c("aa1", "aa2", "rbh_class")
out <- list(rbh, cbind(rbh1, "rbh"), cbind(rbh2, "rbh"))
names(out) <- c("crbh.pairs", "crbh1", "crbh2")
if(!selfblast){
attr(out, "CRBHits.class") <- "crbh"
attr(out, "crbh") <- FALSE
attr(out, "keepSingleDirection") <- FALSE
attr(out, "selfblast") <- selfblast
return(out)
}
if(selfblast){
attr(out, "CRBHits.class") <- "crbh"
attr(out, "crbh") <- FALSE
attr(out, "keepSingleDirection") <- FALSE
attr(out, "selfblast") <- selfblast
return(out)
}
}
#if crbh - continue
if(crbh){
#fit evalue by length
if(!selfblast){
rbh1_rbh2_fit <- fitSpline(c(rbh1[,4], rbh2[,4]),
c(rbh1[, 11], rbh2[, 11]),
fit.type,
fit.varweight,
fit.min)
}
if(selfblast){
rbh1_rbh2_fit <- fitSpline(c(rbh1[,4]),
c(rbh1[, 11]),
fit.type,
fit.varweight,
fit.min)
}
#internal function to filter hit pairs using rbh1_rbh2_fit
filter_crbh <- function(x){
minuslog10evalue_by_fit <- lapply(as.numeric(x[,4]), rbh1_rbh2_fit)
return(x[as.numeric(x[,16]) >= minuslog10evalue_by_fit, ,
drop=FALSE])
}
#remove reciprocal best hits from hit pairs to only look into secondary
#hits
if(!selfblast){
rbh1.idx <- paste0(rbh1[, 1], "\t", rbh1[, 2])
rbh2.idx <- paste0(rbh2[, 2], "\t", rbh2[, 1])
}
if(selfblast){
rbh1.idx <- c(paste0(rbh1[, 1], "\t", rbh1[, 2]), paste0(rbh1[, 2],
"\t", rbh1[, 1]))
rbh2.idx <- c(paste0(rbh2[, 2], "\t", rbh2[, 1]), paste0(rbh2[, 1],
"\t", rbh2[, 2]))
}
aa1_aa2.red <- aa1_aa2[!aa1_aa2.idx %in% rbh1.idx, , drop=FALSE]
aa2_aa1.red <- aa2_aa1[!aa2_aa1.idx %in% rbh2.idx, , drop=FALSE]
#add -log10(evalue)
aa1_aa2.red <- cbind(aa1_aa2.red, -log10(aa1_aa2.red[, 11]))
aa1_aa2.red[is.infinite(aa1_aa2.red[, 16]), 16] <- 324
aa2_aa1.red <- cbind(aa2_aa1.red, -log10(aa2_aa1.red[, 11]))
aa2_aa1.red[is.infinite(aa2_aa1.red[, 16]), 16] <- 324
#filter retained hit pairs with rbh1_rbh2_fit
aa1_aa2.red <- filter_crbh(aa1_aa2.red)
aa2_aa1.red <- filter_crbh(aa2_aa1.red)
aa1_aa2.red.idx <- paste0(aa1_aa2.red[, 1], "\t" , aa1_aa2.red[, 2])
aa2_aa1.red.idx <- paste0(aa2_aa1.red[, 2], "\t" , aa2_aa1.red[, 1])
#deduplicate hit pairs and only retain the best hit per HSP
aa1_aa2.red.dedup <- aa1_aa2.red[!duplicated(aa1_aa2.red.idx), ,
drop=FALSE]
aa2_aa1.red.dedup <- aa2_aa1.red[!duplicated(aa2_aa1.red.idx), ,
drop=FALSE]
#split into reciprocal direction secondary hits (rbh.sec) and single
#direction (single)
aa1_aa2.red.dedup.idx <- paste0(aa1_aa2.red.dedup[, 1], "\t" ,
aa1_aa2.red.dedup[, 2])
aa2_aa1.red.dedup.idx <- paste0(aa2_aa1.red.dedup[, 2], "\t" ,
aa2_aa1.red.dedup[, 1])
rbh1.sec <- aa1_aa2.red.dedup[which(aa1_aa2.red.dedup.idx %in%
aa2_aa1.red.dedup.idx), , drop=FALSE]
rbh2.sec <- aa2_aa1.red.dedup[which(aa2_aa1.red.dedup.idx %in%
aa1_aa2.red.dedup.idx), , drop=FALSE]
if(selfblast){
rbh1.sec <- rbh1.sec[!duplicated(apply(rbh1.sec[, seq_len(2)], 1,
function(x) paste0(sort(x), collapse="\t"))), , drop=FALSE]
rbh2.sec <- rbh2.sec[!duplicated(apply(rbh2.sec[, seq_len(2)], 1,
function(x) paste0(sort(x), collapse="\t"))), , drop=FALSE]
}
single1 <- aa1_aa2.red.dedup[which(!aa1_aa2.red.dedup.idx %in%
aa2_aa1.red.dedup.idx), , drop=FALSE]
single2 <- aa2_aa1.red.dedup[which(!aa2_aa1.red.dedup.idx %in%
aa1_aa2.red.dedup.idx), , drop=FALSE]
#if plotCurve plot fitting
if(plotCurve){
len <- rbh1[, 4]
log10alnlen <- log10(len)
minuslog10evalue <- -log10(rbh1[, 11])
minuslog10evalue[is.infinite(minuslog10evalue)] <- 324
plot(x=log10alnlen, y=minuslog10evalue,
pch=20,
main="Accept / Reject secondary hits as homologs",
ylab="-log10(evalue)",
xlab="log10(alnlength)",
col=col2transparent("#8EBCB5", 50),
cex=0.75)
points(x=log10(rbh1.sec[, 4]), y=-log10(rbh1.sec[, 11]),
pch=21,
bg=col2transparent("#4D83AB", 25),
cex=1)
points(x=log10(single1[, 4]), y=-log10(single1[, 11]),
pch=21,
bg=col2transparent("#CBC106", 25),
cex=1)
if(selfblast){
points(x=log10(seq_len(max(rbh1[, 4]))),
y=rbh1_rbh2_fit(seq(from=1, to=max(rbh1[, 4]))),
type="l",
lwd=2,
col="#9E163C")
legend("bottomright",
legend=c("rbh", "sec", "single"),
col=c("#8EBCB5",
col2transparent("#4D83AB", 25),
col2transparent("#CBC106", 25)),
bty="n",
pch=20)
}
if(!selfblast){
points(x=log10(single2[, 4]), y=-log10(single2[, 11]),
pch=21,
bg=col2transparent("#CB7B26", 25),
cex=1)
points(x=log10(seq_len(max(rbh1[, 4]))),
y=rbh1_rbh2_fit(seq(from=1, to=max(rbh1[, 4]))),
type="l",
lwd=2,
col="#9E163C")
legend("bottomright",
legend=c("rbh", "sec", "single1", "single2"),
col=c("#8EBCB5",
col2transparent("#4D83AB", 25),
col2transparent("#CBC106", 25),
col2transparent("#CB7B26", 25)),
bty="n",
pch=20)
}
}
#if no keepSingleDirection - done
if(!keepSingleDirection){
crbh1 <- data.frame(Map(c ,cbind(rbh1, "rbh"),
cbind(rbh1.sec[, seq_len(15)], "sec")))
colnames(crbh1)[16] <- "rbh_class"
crbh2 <- data.frame(Map(c ,cbind(rbh2, "rbh"),
cbind(rbh2.sec[, seq_len(15)], "sec")))
colnames(crbh2)[16] <- "rbh_class"
crbh <- crbh1[, c(seq_len(2), 16)]
colnames(crbh) <- c("aa1", "aa2", "rbh_class")
out <- list(crbh, crbh1, crbh2, rbh1_rbh2_fit)
names(out) <- c("crbh.pairs", "crbh1", "crbh2", "rbh1_rbh2_fit")
attr(out, "CRBHits.class") <- "crbh"
attr(out, "crbh") <- TRUE
attr(out, "keepSingleDirection") <- FALSE
attr(out, "selfblast") <- selfblast
return(out)
}
#if keepSingleDirection - include single - done
if(keepSingleDirection){
crbh1 <- data.frame(Map(c, cbind(rbh1, "rbh"),
cbind(rbh1.sec[, seq_len(15)], "sec"),
cbind(single1[, seq_len(15)],
"single")))
colnames(crbh1)[16] <- "rbh_class"
crbh2 <- data.frame(Map(c, cbind(rbh2, "rbh"),
cbind(rbh2.sec[, seq_len(15)], "sec"),
cbind(single2[, seq_len(15)],
"single")))
colnames(crbh2)[16] <- "rbh_class"
crbh <- data.frame(Map(c, crbh1[, c(seq_len(2), 16)],
single1[, c(seq_len(2), 16)], single2[, c(2, 1, 16)]))
colnames(crbh) <- c("aa1", "aa2", "rbh_class")
out <- list(crbh, crbh1, crbh2, rbh1_rbh2_fit)
names(out) <- c("crbh.pairs", "crbh1", "crbh2", "rbh1_rbh2_fit")
attr(out, "CRBHits.class") <- "crbh"
attr(out, "crbh") <- TRUE
attr(out, "keepSingleDirection") <- TRUE
attr(out, "selfblast") <- selfblast
return(out)
}
}
}