-
Notifications
You must be signed in to change notification settings - Fork 0
/
TandemRepeatSearcherApprox.java
280 lines (243 loc) · 7.97 KB
/
TandemRepeatSearcherApprox.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.FileNotFoundException;
/**
* Compile: javac TandemRepeatSearcherApprox.java
* Usage: time java TandemRepeatSearcherApprox dna.fasta [min max] [error_rate] > output.txt
*/
public class TandemRepeatSearcherApprox {
/** char sequence of dna */
private char[] input;
/** stores a repeats */
private HashSet<Repeat> repeats = new HashSet<>();
/** Minimum length limit of a motif */
private int minLength = 2;
/** Maximum length limit of a motif */
private int maxLength = 8;
/** starting offset of DNA Chunk */
private int offset = 0;
/** Error Rate depending upon length of Repeats */
private double tolerate = 0.4;
public TandemRepeatSearcherApprox(char [] input, int minLength, int maxLength, int offset, double tolerate) {
this.input = input;
this.minLength = minLength;
this.maxLength = maxLength;
this.offset = offset;
this.tolerate = tolerate;
}
public void print() {
for (Repeat repeat : repeats) {
System.out.println(repeat);
}
}
public int getRepeats() {
return repeats.size();
}
public void status() {
System.out.println(repeats.size() + " Repeats found from " + offset + " to " + (input.length + offset));
}
public int getSize() {
return this.input.length;
}
/**
*
* @param start starting index of DNA Sequences (0 inclusive)
* @param end ending index of DNA Sequences (n-1 inclusive)
* @return center of starting and ending index
*/
private int getCenter(int start, int end) {
return (end - start)/2 + start;
}
/**
* This program find exact tandem repeats in a DNA Sequences
* @param args <path/to/fasta_file> [minMotifLength maxMotifLength]
* @throws FileNotFoundException
* @throws IOException
*/
public static void main(String [] args) throws FileNotFoundException, IOException {
int minLength=2;
int maxLength=8;
int chunk_size = 100000;
double tolerate = 0.4;
if (args.length < 1) {
System.out.println("Usage: java TandemRepeatSearcherApprox /path/to/data.fasta [minLength maxLength] [Error Rate]");
System.exit(1);
}
if (args.length == 3) {
minLength = Integer.parseInt(args[1]);
maxLength = Integer.parseInt(args[2]);
}
if (args.length == 4) {
tolerate = Double.parseDouble(args[3]);
}
File file = new File(args[0]);
ExecutorService thpool = Executors.newFixedThreadPool(8);
int size = (int)file.length();
int chunks = (int) Math.ceil ((double)size/chunk_size);
byte bytes[] = new byte[chunk_size];
FileInputStream fis = new FileInputStream(file);
for (int i=0; i < chunks; i++) {
fis.read(bytes);
Runnable worker = new WorkerThread(new String(bytes), i*chunk_size, minLength, maxLength, tolerate, false);
thpool.execute(worker);
}
thpool.shutdown();
while (!thpool.isTerminated()) {
}
}
private static class WorkerThread implements Runnable {
private TandemRepeatSearcherApprox tandemRepeatSearcher;
private boolean print = false;
public WorkerThread(String dna, int offset, int minLength, int maxLength, double tolerate, boolean print) {
this.print = print;
tandemRepeatSearcher = new TandemRepeatSearcherApprox(dna.toCharArray(), minLength, maxLength, offset, tolerate);
}
@Override
public void run() {
tandemRepeatSearcher.findTandemRepeat(0, tandemRepeatSearcher.getSize()-1);
tandemRepeatSearcher.status();
if (print)
tandemRepeatSearcher.print();
}
}
/**
* Divide and Conquer function for finding tandem repeats
* @param start starting index of DNA Sequences (0 inclusive)
* @param end ending index of DNA Sequences (n-1 inclusive)
*
*/
public void findTandemRepeat(final int start, final int end) {
int length = end-start+1;
if (length < 2 * minLength) return;
final int h = getCenter(start, end);
findTandemRepeat(start, h);
findTandemRepeat(h+1, end);
/** check for repeats, prefer right side */
findTandemRepeatOverCenter (start, end, Side.RIGHT);
/** check for repeats, prefer left side */
findTandemRepeatOverCenter (start, end, Side.LEFT);
}
/**
* Find Tandem Repeats from center of input to both side
* @param start start starting index of DNA Sequences (0 inclusive)
* @param end end ending index of DNA Sequences (n-1 inclusive)
* @param side LEFT or RIGHT
*/
public void findTandemRepeatOverCenter(final int start, final int end, final int side) {
final int h = getCenter(start, end);
for (int l = minLength; l <= h - start && l <= maxLength; l++) {
final int q;
if (side == Side.LEFT) {
q = h - l;
} else {
q = h + l;
}
/** longest common extension in backward direction */
final int lceBackward = backwardLce(start, end, h, q);
/** longest common extension in forward direction */
final int lceForward = forwardLce(start, end, h+1, q+1);
if (lceBackward + lceForward >= l) // Tandem Repeat Found
{
final int startPos, endPos;
if (side == Side.LEFT) {
startPos = q - lceBackward + 1;
endPos = h + lceForward;
} else {
startPos = h - lceBackward + 1;
endPos = q + lceForward;
}
/** Add repeat to HashSet */
repeats.add(new Repeat(startPos + offset, lceBackward+lceForward+l, l));
}
}
}
/**
* Calculate Backward longest common extension
* @param start starting index of sequence
* @param end ending index of sequence
* @param i starting index of motif
* @param j ending index of motif
* @return no. of base pairs mathched
*/
public int backwardLce(final int start, final int end, int i, int j) {
int lce = 0;
int t = 0;
final int length = input.length;
for(int k = 0; k <= length; k++, i--, j--) {
lce = k;
if (i < start || i > end) { break; }
if (j < start || j > end) { break; }
if(input[i] != input[j]) {
t++;
if ((double) t/k > tolerate) {
break;
}
}
}
return lce;
}
/**
* Calculate Backward longest common extension
* @param start starting index of sequence
* @param end ending index of sequence
* @param i starting index of motif
* @param j ending index of motif
* @return no. of base pairs mathched
*/
public int forwardLce(final int start, final int end, int i, int j) {
int lce = 0;
int t = 0;
final int length = input.length;
for(int k = 0; k <= length; k++, i++, j++) {
lce = k;
if (i < start || i > end) { break; }
if (j < start || j > end) { break; }
if(input[i] != input[j]) {
t++;
if ((double) t/k > tolerate) {
break;
}
}
}
return lce;
}
class Repeat {
/** motif length */
final int motif;
/** Starting index */
final int start;
/** total length of repeat */
final int length;
Repeat(int start, int length, int motif) {
this.start = start;
this.length = length;
this.motif = motif;
}
@Override
public boolean equals(Object obj) {
if(!(obj instanceof Repeat)) return false;
Repeat other = (Repeat) obj;
return other.start == this.start && (other.length == this.length || other.motif == other.motif);
}
/**
* repeat with same starting point and same length is replaced with later on
* @return hashcode of a Repeat object
*/
@Override
public int hashCode() {
return 17*(int)start + 11*(int)length;
}
@Override
public String toString() {
return "[start: " + start + ", length: " + length + ", motif:" + motif + "]";
}
}
class Side {
public static final int LEFT = 0;
public static final int RIGHT = 1;
}
}