mg_emo_dataset.py
from dataloaders.char_dataset import char_dataset
from dataloaders.scene_dataset import scene_dataset
from dataloaders.srt_dataset import srt_dataset
from pathlib import Path
from torch.utils.data import Dataset
from utils.movie_scene_mapper import movie_scene_mapper
import torch


class character_emo_dataset(Dataset):
"""
Dataset object used to load scene, character and subtitle features along with multilabel targets for emotion recogntion.
"""
def __init__(self, config, movie_ids, split_type, random_feat_selection=True, with_srt=True, emo2id=dict()):
"""
Args:
config (dict): Configuration dictionary.
movie_ids (list): List of movie ids.
split_type (str): Split type. Can be train, val or test.
random_feat_selection (bool): Whether to select features randomly or uniformly based on sampling strategy.
with_srt (bool): Whether to load subtitle features.
emo2id (dict): Dictionary mapping emotion to id.
"""
self.config = config
self.data_path = Path(config["data_path"])
self.scene_feat_dim = config["feat_info"][config["scene_feat_type"]]["scene_feat_dim"]
self.char_feat_dim = config["feat_info"][config["face_feat_type"]]["face_feat_dim"]
self.srt_feat_dim = config["feat_info"][config["srt_feat_model"]]["srt_feat_dim"]
self.feat_sampling_rate = config["feat_sampling_rate"]
self.num_chars = config["num_chars"]
self.with_srt = with_srt
self.max_features = config["max_feats"]
self.vid_fps = config["vid_fps"]
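        # Frame indices are mapped to coarse temporal bins: each bin spans
        # feat_sampling_rate / vid_fps seconds, and fps_boundaries holds the
        # cumulative bin edges consumed by torch.bucketize() in collate().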
self.bin_size = self.feat_sampling_rate/self.vid_fps
self.num_bins = int(self.config["max_possible_vid_frame_no"]/self.feat_sampling_rate)
self.fps_boundaries = torch.cumsum(torch.tensor([0]+[self.bin_size]*self.num_bins), dim=0)
self.split_type = split_type
self.char_dataset_obj = char_dataset(config, movie_ids, random_feat_selection)
self.movie_scene_map = movie_scene_mapper(self.config, movie_ids, emo2id,
char_level=True,
extended_named_track_info=self.char_dataset_obj.new_named_track_cache,
emotic_mapped=config["use_emotic_mapping"])
self.top_k = self.movie_scene_map.top_k
self.scene_dataset_obj = scene_dataset(config, random_feat_selection)
self.srt_dataset_obj = srt_dataset(config, movie_ids, self.top_k)

    def get_emo2id_map(self):
"""
Method to get emotion to id mapping.
Returns:
dict: Dictionary mapping emotion to id.
"""
return self.movie_scene_map.get_emo2id_mapping()

    def collate(self, batches):
"""
Method to collate batches.
Based on config parameters, it loads scene, character and subtitle features along with their respective targets.
Args:
batches (list): List of batches.
Returns:
collated_data (dict): Collated batches with features, masks and targets.
"""
char_count_in_scenes = [len(batch["targets"][0]) for batch in batches]
if self.config["get_scene_targets"]:
scene_target_mask = torch.cat([torch.ones((len(batches), 1, self.top_k)), torch.zeros((len(batches), self.num_chars, self.top_k))], dim=1).flatten().type(torch.bool)
scene_targets = torch.stack([resp["targets"][1] for resp in batches])
else:
scene_target_mask = torch.zeros((len(batches)*(self.num_chars+1)*self.top_k)).type(torch.bool)
scene_targets = torch.zeros((0, self.top_k))
if self.config["get_char_targets"]:
char_targets_raw = [torch.stack(list(resp["targets"][0].values()))[:self.num_chars] for resp in batches]
char_target_mask = torch.stack([torch.vstack([torch.ones(target.shape), torch.zeros((max(0,self.num_chars-target.shape[0]),self.top_k))]) for target in char_targets_raw])
char_target_mask = torch.cat([torch.zeros((len(batches), 1, self.top_k)), char_target_mask], dim=1).flatten().type(torch.bool)
char_targets = torch.cat([torch.stack(list(resp["targets"][0].values())[:self.num_chars]) for resp in batches], dim=0)
else:
char_target_mask = torch.zeros((len(batches), self.num_chars+1, self.top_k)).flatten().type(torch.bool)
char_targets = torch.zeros((0, self.top_k))
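        # Scene features: real features when enabled, otherwise zero features
        # with fully-masked (all-ones) padding masks of matching shape.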
if self.config["use_scene_feats"]:
scene_feats = torch.stack([resp["feats"][1] for resp in batches]).type(torch.float)
scene_frames = torch.stack([resp["timestamps"][1] for resp in batches])
scene_frames = torch.bucketize(scene_frames/self.vid_fps, self.fps_boundaries)
scene_masks = torch.stack([resp["masks"][1] for resp in batches])
else:
scene_feats = torch.zeros((len(batches), self.max_features, self.scene_feat_dim)).type(torch.float)
scene_frames = torch.zeros((len(batches), self.max_features)).to(torch.long)
scene_masks = torch.ones((len(batches), self.max_features))
if self.config["use_char_feats"]:
if self.config["joint_character_modeling"]:
empty_char_feat = torch.zeros((self.max_features, self.char_feat_dim))
padded_char_mask = torch.ones((self.max_features))
char_feats = [torch.stack(list(resp["feats"][0].values()))[:self.num_chars] for resp in batches]
char_feats = torch.stack([torch.vstack([feat, empty_char_feat.repeat(max(0,self.num_chars-feat.shape[0]),1,1)]) for feat in char_feats]).type(torch.float)
char_frames = [torch.stack(list(resp["timestamps"][0].values()))[:self.num_chars] for resp in batches]
char_frames = torch.stack([torch.vstack([frames, torch.zeros((self.max_features)).repeat(max(0,self.num_chars-frames.shape[0]),1)]) for frames in char_frames])
char_frames = torch.bucketize(char_frames/self.vid_fps, self.fps_boundaries)
char_masks = [torch.stack(list(resp["masks"][0].values()))[:self.num_chars] for resp in batches]
char_masks = torch.stack([torch.vstack([mask, padded_char_mask.repeat(max(0,self.num_chars-mask.shape[0]),1)]) for mask in char_masks])
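                # Optional augmentation: permute the character slots (and their
                # frames, masks, and targets consistently) so the model cannot
                # rely on slot order.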
if self.config["shuffle_characters"]:
shuffled_ndxs = torch.randperm(self.num_chars)
char_feats = char_feats[:,shuffled_ndxs,:,:]
char_frames = char_frames[:,shuffled_ndxs,:]
                    char_masks = char_masks[:, shuffled_ndxs, :]
char_target_mask = char_target_mask.reshape(char_feats.shape[0], self.num_chars+1, self.top_k)[:,1:,:]
char_target_mask = char_target_mask[:,shuffled_ndxs,:]
char_target_mask = torch.cat([torch.zeros((scene_feats.shape[0], 1, self.top_k)), char_target_mask], dim=1).flatten().type(torch.bool)
char_targets = torch.stack([torch.vstack([target, torch.zeros((self.top_k)).repeat(max(0,self.num_chars-target.shape[0]),1)]) for target in char_targets_raw])
char_targets = char_targets[:,shuffled_ndxs,:]
stacked_tgt = torch.cat([torch.zeros(scene_feats.shape[0], 1, self.top_k), char_targets], dim=1)
char_targets = stacked_tgt.flatten()[char_target_mask].reshape(-1, self.top_k)
else:
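                # Per-character modeling: every annotated character becomes its
                # own sample, so scene/subtitle tensors are replicated per
                # character and the remaining num_chars-1 slots stay empty.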
scene_feats = torch.cat([feat.repeat(char_count_in_scenes[ndx], 1, 1) for ndx, feat in enumerate(scene_feats)])
scene_frames = torch.cat([frames.repeat(char_count_in_scenes[ndx], 1) for ndx, frames in enumerate(scene_frames)])
scene_masks = torch.cat([mask.repeat(char_count_in_scenes[ndx], 1) for ndx, mask in enumerate(scene_masks)])
scene_target_mask = torch.cat([mask.repeat(char_count_in_scenes[ndx], 1) for ndx, mask in enumerate(scene_target_mask.reshape(len(batches), self.num_chars+1, self.top_k))]).flatten().to(torch.bool)
scene_targets = torch.cat([target.repeat(char_count_in_scenes[ndx], 1) for ndx, target in enumerate(scene_targets)]) if scene_targets.shape[0] else scene_targets
char_feats = torch.cat([torch.stack(list(resp["feats"][0].values())) for resp in batches], dim=0)
char_feats = torch.cat([char_feats.unsqueeze(1), torch.zeros((char_feats.shape[0], self.num_chars-1, self.max_features, self.char_feat_dim))], dim=1).type(torch.float)
char_frames = torch.cat([torch.stack(list(resp["timestamps"][0].values())) for resp in batches], dim=0)
char_frames = torch.cat([char_frames.unsqueeze(1), torch.zeros((char_frames.shape[0], self.num_chars-1, self.max_features))], dim=1)
char_frames = torch.bucketize(char_frames/self.vid_fps, self.fps_boundaries)
char_masks = torch.cat([torch.stack(list(resp["masks"][0].values())) for resp in batches], dim=0)
                char_masks = torch.cat([char_masks.unsqueeze(1), torch.ones((char_masks.shape[0], self.num_chars - 1, self.max_features))], dim=1)
char_target_mask = torch.cat([torch.ones((char_feats.shape[0], 1, self.top_k)), torch.zeros((char_feats.shape[0], self.num_chars-1, self.top_k))], dim=1)
char_target_mask = torch.cat([torch.zeros((scene_feats.shape[0], 1, self.top_k)), char_target_mask], dim=1).flatten().type(torch.bool)
char_targets = torch.cat([torch.stack(list(resp["targets"][0].values())) for resp in batches], dim=0)
else:
char_feats = torch.zeros((scene_feats.shape[0], self.num_chars, self.max_features, self.char_feat_dim)).type(torch.float)
char_frames = torch.zeros((scene_feats.shape[0], self.num_chars, self.max_features)).to(torch.long)
char_masks = torch.ones((scene_feats.shape[0], self.num_chars, self.max_features))
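        # Subtitle (SRT) features follow the same pattern: real features when
        # enabled (replicated per character in per-character mode), zeros plus
        # all-masked padding otherwise.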
if self.config["use_srt_feats"]:
srt_feats = torch.stack([resp["feats"][2] for resp in batches])
srt_times = torch.stack([resp["timestamps"][2] for resp in batches])
srt_bins = torch.bucketize(srt_times, self.fps_boundaries)
srt_masks = torch.stack([resp["masks"][2] for resp in batches])
if not self.config["joint_character_modeling"]:
srt_feats = torch.cat([feat.repeat(char_count_in_scenes[ndx], 1, 1) for ndx, feat in enumerate(srt_feats)])
srt_bins = torch.cat([bins.repeat(char_count_in_scenes[ndx], 1) for ndx, bins in enumerate(srt_bins)])
srt_masks = torch.cat([mask.repeat(char_count_in_scenes[ndx], 1) for ndx, mask in enumerate(srt_masks)])
else:
srt_feats = torch.zeros((scene_feats.shape[0], self.max_features, self.srt_feat_dim))
srt_times = torch.zeros((scene_feats.shape[0], self.max_features))
srt_masks = torch.ones((scene_feats.shape[0], self.max_features))
srt_bins = torch.zeros((scene_feats.shape[0], self.max_features)).to(torch.long)
collated_data = {
"feats": [char_feats, scene_feats, srt_feats, char_frames, scene_frames, srt_bins],
"masks": [char_masks, scene_masks, srt_masks, char_target_mask, scene_target_mask],
"targets": [char_targets, scene_targets],
}
return collated_data

    def __getitem__(self, ndx):
"""
Returns a dictionary of features, masks, and targets for the given index.
Args:
ndx (int): Index of the item to be returned.
Returns:
response (dict): A dictionary of features, masks, and targets.
"""
scenes, targets = self.movie_scene_map[ndx]
chars = list(targets['chars'].keys())
scene_feat, scene_feat_mask, scene_frames = None, None, None
char_feats, char_feat_masks, char_frames = None, None, None
srt_feats, srt_pad_mask, srt_times = None, None, None
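        # Each modality is fetched only when enabled in the config; disabled
        # modalities keep their None placeholders and are synthesized in collate().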
if self.config["use_scene_feats"]:
scene_feat, scene_feat_mask, scene_frames = self.scene_dataset_obj.get_scene_feat(scenes)
if self.config["use_char_feats"]:
char_feats, char_feat_masks, char_frames = self.char_dataset_obj.get_character_feat(scenes, chars)
if self.config["use_srt_feats"]:
srt_feats, srt_times, srt_pad_mask = self.srt_dataset_obj.get_srt_feats(scenes)
response = {
"feats": [char_feats, scene_feat, srt_feats],
"masks": [char_feat_masks, scene_feat_mask, srt_pad_mask],
"timestamps": [char_frames, scene_frames, srt_times],
"targets": [targets["chars"], targets["scene"]],
}
return response

    def __len__(self):
"""
Returns the length of the dataset.
"""
return len(self.movie_scene_map)
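

# --- Minimal usage sketch (illustrative, not part of the dataset class) ---
# The config path and movie id below are assumptions made for the sake of the
# example; substitute the values from your own experiment setup. The custom
# collate is passed to the DataLoader to build the flattened layout above.
if __name__ == "__main__":
    import yaml
    from torch.utils.data import DataLoader

    with open("config.yaml") as f:  # hypothetical config path
        config = yaml.safe_load(f)

    movie_ids = ["tt0109830"]  # hypothetical movie id list
    dataset = character_emo_dataset(config, movie_ids, split_type="train")
    emo2id = dataset.get_emo2id_map()

    loader = DataLoader(dataset, batch_size=4, collate_fn=dataset.collate)
    batch = next(iter(loader))
    print(batch["targets"][0].shape)  # (num selected character targets, top_k)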