import os

import numpy as np


def reduce_by_overlap(detections, indices, overlap_thresh):
    """Greedy pairwise suppression, similar in spirit to non-maximum
    suppression: for every pair of boxes whose IoU exceeds overlap_thresh,
    discard the lower-confidence one. Each row of `detections` is
    (frame, class, confidence, xmin, ymin, xmax, ymax)."""
    discard = []
    for i in range(len(indices) - 1):
        for j in range(i + 1, len(indices)):
            index1 = indices[i]
            index2 = indices[j]
            _, _, conf1, x1min, y1min, x1max, y1max = detections[index1]
            _, _, conf2, x2min, y2min, x2max, y2max = detections[index2]
            # intersection rectangle (empty if the boxes do not overlap)
            xmin = max(x1min, x2min)
            ymin = max(y1min, y2min)
            xmax = min(x1max, x2max)
            ymax = min(y1max, y2max)
            iw = max(xmax - xmin, 0)
            ih = max(ymax - ymin, 0)
            inters = iw * ih
            # intersection over union (IoU)
            uni = (x1max - x1min) * (y1max - y1min) + (x2max - x2min) * (y2max - y2min) - inters
            overlap = inters * 1.0 / uni
            if overlap > overlap_thresh:
                if conf1 > conf2:
                    discard.append(index2)
                else:
                    discard.append(index1)
    return [index for index in indices if index not in discard]
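
# A minimal sanity check of reduce_by_overlap on hypothetical toy data
# (not part of the original pipeline; safe to delete). The two boxes below
# have IoU = 90 / 110 ~ 0.82 > 0.7, so the lower-confidence one is dropped.
_toy = np.array([[1, 3, 0.95, 0, 0, 10, 10],
                 [1, 3, 0.60, 0, 1, 10, 11]])
assert reduce_by_overlap(_toy, [0, 1], 0.7) == [0]
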
# CHANGE TO YOUR OWN PATHS
detection_results_root = '/detection_results'  # where the .npy object detection files are saved
top_detections_root = '/top_detections'  # where the filtered .npy files will be written
detections = os.listdir(detection_results_root)
total_cls = 15  # number of object classes (class labels run 1..total_cls)
frame_per_video = 80  # frames per video (frame labels run 1..frame_per_video)
confidence_thresh = 0.9  # discard detections at or below this confidence
overlap_thresh = 0.7  # IoU above which overlapping boxes are suppressed
max_number_of_cls = 2  # keep at most this many classes per video
single_detection_each_class = False  # if True, keep only the single best box per class
class_count_dic = {}  # histogram: number of classes kept per video -> number of videos
for detection in detections:
    print(detection)
    detection_file = os.path.join(detection_results_root, detection)
    detection_npy = np.load(detection_file)
    # filter by confidence: bucket the row indices of confident detections by frame
    confidence_threshed = [[] for _ in range(frame_per_video + 1)]
    for index, single_detection in enumerate(detection_npy):
        (frame, _, confidence, _, _, _, _) = single_detection
        frame = int(frame)
        if confidence > confidence_thresh:
            confidence_threshed[frame].append(index)
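    # e.g. (hypothetical) confidence_threshed[12] == [5, 9] would mean rows 5
    # and 9 of the .npy file are confident detections on frame 12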
    # filter by overlap: run the pairwise suppression per frame,
    # then bucket the surviving row indices by class
    overlap_threshed = [[] for _ in range(total_cls + 1)]
    for frame in range(1, frame_per_video + 1):
        if len(confidence_threshed[frame]) == 0:
            continue
        if len(confidence_threshed[frame]) > 1:
            confidence_threshed[frame] = reduce_by_overlap(detection_npy, confidence_threshed[frame], overlap_thresh)
        for index in confidence_threshed[frame]:
            (_, cls, _, _, _, _, _) = detection_npy[index]
            cls = int(cls)
            overlap_threshed[cls].append(index)
    # whether to keep only the top detection for each class
    single_detection_threshed = [[] for _ in range(total_cls + 1)]
    if single_detection_each_class:
        for cls in range(1, total_cls + 1):
            if len(overlap_threshed[cls]) == 0:
                continue
            max_confidence = np.argmax([detection_npy[index][2] for index in overlap_threshed[cls]])
            single_detection_threshed[cls].append(overlap_threshed[cls][max_confidence])
    else:
        for cls in range(1, total_cls + 1):
            single_detection_threshed[cls].extend(overlap_threshed[cls])
    # keep only the top N classes, ranked by each class's highest detection confidence
    top_classes_threshed = [[] for _ in range(total_cls + 1)]
    max_confidence_dic = {}
    for cls in range(1, total_cls + 1):
        if len(single_detection_threshed[cls]) == 0:
            continue
        max_confidence = np.max([detection_npy[index][2] for index in single_detection_threshed[cls]])
        max_confidence_dic[cls] = max_confidence
    if len(max_confidence_dic) <= max_number_of_cls:
        top_classes_threshed = single_detection_threshed
    else:
        inverse = [(value, key) for key, value in max_confidence_dic.items()]
        for _ in range(max_number_of_cls):
            max_conf, max_cls = max(inverse)
            inverse.remove((max_conf, max_cls))
            top_classes_threshed[max_cls].extend(single_detection_threshed[max_cls])
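    # Illustration (hypothetical numbers): with max_confidence_dic ==
    # {2: 0.98, 7: 0.95, 11: 0.93} and max_number_of_cls == 2, the loop
    # above keeps the detections of classes 2 and 7 and drops class 11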
    # combine all surviving classes into one array
    results = np.array([])
    count = 0
    for cls in range(1, total_cls + 1):
        if len(top_classes_threshed[cls]) == 0:
            continue
        count = count + 1
        result = np.array([detection_npy[index] for index in top_classes_threshed[cls]])
        if results.shape[0] != 0:
            results = np.concatenate((results, result), axis=0)
        else:
            results = result
    if count in class_count_dic:
        class_count_dic[count] = class_count_dic[count] + 1
    else:
        class_count_dic[count] = 1
    # This step ensures the 10s clip used contains an optimal number of detected objects.
    # The bounds might need to be tuned for different datasets; they can be
    # useful for dealing with very noisy datasets like AudioSet.
    if 60 <= results.shape[0] <= 300:
        np.save(os.path.join(top_detections_root, detection), results)
    print(count)  # number of classes detected in the video
print(class_count_dic)
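
# Usage sketch (assumed workflow, not stated in the original script):
#   1. run an object detector over each video and save one <video>.npy per
#      video, where every row is (frame, class, confidence, xmin, ymin, xmax, ymax)
#   2. point detection_results_root / top_detections_root at existing folders
#   3. python getTopDetections.py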