sort_data.py
# Use this script to sort the BirdCLEF 2018 dataset
# Unpack the archive file containing xml-files and wav-files first
# Use config.py to specify source and target paths
# Author: Stefan Kahl, 2018, Chemnitz University of Technology
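# The script only relies on cfg.TRAINSET_PATH and cfg.getRandomState() from config.py.
# A minimal config.py could look like the following sketch (path and seed value are
# assumptions for illustration, not the repository's actual settings):
#
#   # Root of the unpacked BirdCLEF 2018 training set (must contain 'wav' and 'xml' folders)
#   TRAINSET_PATH = '/path/to/BirdCLEF2018/TrainingSet'
#
#   # Fixed seed so the train/val split is reproducible across runs
#   RANDOM_SEED = 1337
#
#   def getRandomState():
#       return RANDOM_SEED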
import os
import json
from sklearn.utils import shuffle
from shutil import copyfile
import xmltodict as x2d
import config as cfg
################### METADATA HANDLING ####################
def parseDataset():
    metadata = {}

    # List of wav-files
    wav_path = os.path.join(cfg.TRAINSET_PATH, 'wav')
    wav_files = [f for f in sorted(os.listdir(wav_path))]
    print 'DATASET CONTAINS', len(wav_files), 'WAV_FILES'

    # List all xml-files
    xml_path = os.path.join(cfg.TRAINSET_PATH, 'xml')
    xml_files = [os.path.join(xml_path, f) for f in sorted(os.listdir(xml_path))]
    print 'PARSING', len(xml_files), 'XML-FILES...'

    # Open xml-files and extract metadata
    for i in range(len(xml_files)):

        # Read content
        xml = open(xml_files[i], 'r').read()
        data = x2d.parse(xml)
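        # Each xml-file is expected to provide the fields accessed below, roughly like this
        # (illustrative sketch with made-up values; real files may contain additional tags):
        #   <Audio>
        #     <FileName>LIFECLEF2018_BIRD_XC_12345.wav</FileName>
        #     <ClassId>classid_0001</ClassId>
        #     <Genus>Turdus</Genus>
        #     <Species>merula</Species>
        #     <VernacularNames>Common Blackbird</VernacularNames>
        #     <BackgroundSpecies>Erithacus rubecula,Parus major</BackgroundSpecies>
        #   </Audio>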
        # The 2017 dataset has no annotated background species
        # We have to handle those separately
        try:
            background = data['Audio']['BackgroundSpecies'].split(',')
        except:
            background = []

        # Create new metadata
        mdata = {'sci-name': data['Audio']['Genus'] + ' ' + data['Audio']['Species'],
                 'species': data['Audio']['VernacularNames'].split(',')[0],
                 'background': background,
                 'filename': data['Audio']['FileName'],
                 'classid': data['Audio']['ClassId']}

        # Save metadata to dict if wav-file exists
        if mdata['filename'] in wav_files:
            if not mdata['classid'] in metadata:
                metadata[mdata['classid']] = []
            metadata[mdata['classid']].append(mdata)

        # Status (parsing the files might take a while)
        if not i % 100:
            print '\t', i, '/', len(xml_files)

    print '...DONE!', len(metadata), 'CLASSES IN DATASET'

    return metadata
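# The dict returned above maps each class id to a list of per-recording metadata entries,
# e.g. (illustrative values only):
#   {'classid_0001': [{'sci-name': 'Turdus merula', 'species': 'Common Blackbird',
#                      'background': ['Erithacus rubecula'],
#                      'filename': 'LIFECLEF2018_BIRD_XC_12345.wav',
#                      'classid': 'classid_0001'}, ...], ...}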
#################### CREATE SPLITS #####################
def sortDataset(mdata):
    print 'PARSING CLASSES...'

    # Parse classes
    for c in mdata:

        print '\t', c

        # Determine size of val split (10% but at least 1 file)
        val = max(1, len(mdata[c]) * 0.1)

        # Shuffle list of files
        mdata[c] = shuffle(mdata[c], random_state=cfg.getRandomState())

        # Parse list of files and copy to destination
        for f in mdata[c]:

            # Get class name (we use the sci-name which makes it easier to evaluate with background species)
            # The submission format uses class id only - so we have to figure that out later
            cname = f['sci-name']

            # Make folders
            m_path = os.path.join(cfg.TRAINSET_PATH, 'metadata')
            if not os.path.exists(m_path):
                os.makedirs(m_path)
            t_path = os.path.join(cfg.TRAINSET_PATH, 'train', cname)
            if not os.path.exists(t_path):
                os.makedirs(t_path)
            v_path = os.path.join(cfg.TRAINSET_PATH, 'val', cname)
            if not os.path.exists(v_path):
                os.makedirs(v_path)

            # Copy files
            with open(os.path.join(m_path, f['filename'].rsplit('.')[0] + '.json'), 'w') as mfile:
                json.dump(f, mfile)
            if mdata[c].index(f) < val:
                copyfile(os.path.join(cfg.TRAINSET_PATH, 'wav', f['filename']), os.path.join(v_path, f['filename']))
            else:
                copyfile(os.path.join(cfg.TRAINSET_PATH, 'wav', f['filename']), os.path.join(t_path, f['filename']))

    print '...DONE!'
if __name__ == '__main__':
    # Create metadata for entire dataset
    metadata = parseDataset()

    # Split into train and val, copy files
    sortDataset(metadata)
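# Running the script (e.g. `python sort_data.py` with Python 2) produces the following
# layout under cfg.TRAINSET_PATH; folder and file names below are examples:
#
#   metadata/<recording>.json                 per-recording metadata as JSON
#   train/<Genus Species>/<recording>.wav     ~90% of the files per class
#   val/<Genus Species>/<recording>.wav       ~10% of the files per class (at least 1)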