sort_data.py
# Use this script to sort the BirdCLEF 2018 dataset
# Unpack the archive file containing xml-files and wav-files first
# Use config.py to specify source and target paths
# Author: Stefan Kahl, 2018, Chemnitz University of Technology
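# The script only relies on cfg.TRAINSET_PATH and cfg.getRandomState() from config.py.
# A minimal config.py could look like the following sketch (path and seed value are
# assumptions for illustration, not the repository's actual settings):
#
#   # Root of the unpacked BirdCLEF 2018 training set (must contain 'wav' and 'xml' folders)
#   TRAINSET_PATH = '/path/to/BirdCLEF2018/TrainingSet'
#
#   # Fixed seed so the train/val split is reproducible across runs
#   RANDOM_SEED = 1337
#
#   def getRandomState():
#       return RANDOM_SEED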
import os
import json
from sklearn.utils import shuffle
from shutil import copyfile
import xmltodict as x2d
import config as cfg
################### METADATA HANDLING ####################
def parseDataset():
    metadata = {}

    # List of wav-files
    wav_path = os.path.join(cfg.TRAINSET_PATH, 'wav')
    wav_files = [f for f in sorted(os.listdir(wav_path))]
    print 'DATASET CONTAINS', len(wav_files), 'WAV_FILES'

    # List all xml-files
    xml_path = os.path.join(cfg.TRAINSET_PATH, 'xml')
    xml_files = [os.path.join(xml_path, f) for f in sorted(os.listdir(xml_path))]
    print 'PARSING', len(xml_files), 'XML-FILES...'

    # Open xml-files and extract metadata
    for i in range(len(xml_files)):

        # Read content
        xml = open(xml_files[i], 'r').read()
        data = x2d.parse(xml)
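        # Each xml-file is expected to provide the fields accessed below, roughly like this
        # (illustrative sketch with made-up values; real files may contain additional tags):
        #   <Audio>
        #     <FileName>LIFECLEF2018_BIRD_XC_12345.wav</FileName>
        #     <ClassId>classid_0001</ClassId>
        #     <Genus>Turdus</Genus>
        #     <Species>merula</Species>
        #     <VernacularNames>Common Blackbird</VernacularNames>
        #     <BackgroundSpecies>Erithacus rubecula,Parus major</BackgroundSpecies>
        #   </Audio>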
        # The 2017 dataset has no annotated background species
        # We have to handle those separately
        try:
            background = data['Audio']['BackgroundSpecies'].split(',')
        except:
            background = []

        # Create new metadata
        mdata = {'sci-name': data['Audio']['Genus'] + ' ' + data['Audio']['Species'],
                 'species': data['Audio']['VernacularNames'].split(',')[0],
                 'background': background,
                 'filename': data['Audio']['FileName'],
                 'classid': data['Audio']['ClassId']}

        # Save metadata to dict if wav-file exists
        if mdata['filename'] in wav_files:
            if not mdata['classid'] in metadata:
                metadata[mdata['classid']] = []
            metadata[mdata['classid']].append(mdata)

        # Status (parsing the files might take a while)
        if not i % 100:
            print '\t', i, '/', len(xml_files)

    print '...DONE!', len(metadata), 'CLASSES IN DATASET'

    return metadata
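# The dict returned above maps each class id to a list of per-recording metadata entries,
# e.g. (illustrative values only):
#   {'classid_0001': [{'sci-name': 'Turdus merula', 'species': 'Common Blackbird',
#                      'background': ['Erithacus rubecula'],
#                      'filename': 'LIFECLEF2018_BIRD_XC_12345.wav',
#                      'classid': 'classid_0001'}, ...], ...}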
#################### CREATE SPLITS #####################
def sortDataset(mdata):
    print 'PARSING CLASSES...'

    # Parse classes
    for c in mdata:

        print '\t', c

        # Determine size of val split (10% but at least 1 file)
        val = max(1, len(mdata[c]) * 0.1)

        # Shuffle list of files
        mdata[c] = shuffle(mdata[c], random_state=cfg.getRandomState())

        # Parse list of files and copy to destination
        for f in mdata[c]:

            # Get class name (we use the sci-name which makes it easier to evaluate with background species)
            # The submission format uses class id only - so we have to figure that out later
            cname = f['sci-name']

            # Make folders
            m_path = os.path.join(cfg.TRAINSET_PATH, 'metadata')
            if not os.path.exists(m_path):
                os.makedirs(m_path)
            t_path = os.path.join(cfg.TRAINSET_PATH, 'train', cname)
            if not os.path.exists(t_path):
                os.makedirs(t_path)
            v_path = os.path.join(cfg.TRAINSET_PATH, 'val', cname)
            if not os.path.exists(v_path):
                os.makedirs(v_path)

            # Copy files
            with open(os.path.join(m_path, f['filename'].rsplit('.')[0] + '.json'), 'w') as mfile:
                json.dump(f, mfile)
            if mdata[c].index(f) < val:
                copyfile(os.path.join(cfg.TRAINSET_PATH, 'wav', f['filename']), os.path.join(v_path, f['filename']))
            else:
                copyfile(os.path.join(cfg.TRAINSET_PATH, 'wav', f['filename']), os.path.join(t_path, f['filename']))

    print '...DONE!'
if __name__ == '__main__':
    # Create metadata for entire dataset
    metadata = parseDataset()

    # Split into train and val, copy files
    sortDataset(metadata)
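# Running the script (e.g. `python sort_data.py` with Python 2) produces the following
# layout under cfg.TRAINSET_PATH; folder and file names below are examples:
#
#   metadata/<recording>.json                 per-recording metadata as JSON
#   train/<Genus Species>/<recording>.wav     ~90% of the files per class
#   val/<Genus Species>/<recording>.wav       ~10% of the files per class (at least 1)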