bird_recognition.Rmd

---
title: "Bird Recognition"
output:
  html_notebook:
    toc: TRUE
    toc_float: TRUE
---

GitHub repo: https://github.com/UoA-eResearch/bird_recognition

## Load the necessary packages

```{r setup, warning=FALSE}
library(bioacoustics)
library(tuneR)
library(seewave)
library(dplyr)
library(tools)
library(randomForest)
library(stringr)
library(keras)
```

## Load the wave files. Separate out the metadata from the filename into columns. Render some spectrograms

```{r load}
files = list.files("wav_files_playback", "*.wav", full.names=TRUE)
files_without_extension = basename(file_path_sans_ext(files))
wavs = setNames(
  lapply(
    files,
    read_audio
  ),
  files_without_extension
)
metadata = data.frame(str_split_fixed(files_without_extension, "_", 3))
metadata = cbind(metadata, files_without_extension)
colnames(metadata) = c("birdid", "calltype", "idnumber", "filename")
head(metadata)
oscillo(wavs[[1]])
birds = unique(metadata$birdid)
filtered = filter(metadata, birdid == birds[1])
for (i in 1:nrow(filtered)) {
  audio = wavs[[filtered$filename[i]]]
  spectro(audio, main=filtered$filename[i])
}
```

Each calltype seems to have a unique spectrogram.

## Split dataset into train / test. The test dataset will contain one row for each combination of birdid / calltype

```{r split}
test = metadata %>% 
          group_by(birdid, calltype) %>% 
          filter(row_number()==1)
test
train = metadata %>% 
          group_by(birdid, calltype) %>% 
          filter(row_number()!=1)
train
```

## Train a random forest classifier based on Mel-frequency Cepstral Coefficients (MFCC)

```{r rf}
mels = sapply(wavs, melfcc, numcep = 20) # Calculate all MFCCs
head(mels[[1]]) # Result is a matrix of 20 coefficients across nrow time frames
image(mels[[1]])

trainMels = mels[train$filename] # Select the MFCCs corresponding to the training dataset
trainM = do.call(rbind, trainMels) # melfcc gives a matrix - rbind to cast from 3D to 2D across the whole training dataset
trainM.labels = rep(train$birdid, lapply(trainMels, function(x) dim(x)[1])) # Create birdid labels for each MFCC
rownames(trainM) = trainM.labels
head(trainM)

set.seed(1337)
rf = randomForest(trainM, trainM.labels, importance = FALSE, proximity = FALSE, replace = TRUE, ntree = 4000, mtry = 4)
rf
```
### Check accuracy of randomForest model

```{r rf-acc}
testMels = mels[test$filename]
testM = do.call(rbind, testMels)
testM.labels = rep(test$birdid, lapply(testMels, function(x) dim(x)[1]))
rownames(testM) = testM.labels
head(testM)

# Let's make predictions with our classifier on a test set
table = table(testM.labels, predict(rf, testM, type = "response"))
table

accuracy_pct = round(sum(diag(table)) / sum(table) * 100, 2)
print(paste("accuracy across whole dataset", accuracy_pct, "%"))

# To look at the predictions 
predict(rf, testM, type = "prob")

predictions = character(nrow(test))
for (i in 1:nrow(test)) {
  mel = testMels[[i]]
  prediction = names(which.max(colMeans(predict(rf, mel, type="prob"))))
  predictions[i] = prediction
}
testWithPredictions = data.frame(test, predictions)
testWithPredictions
correct_predictions = nrow(filter(testWithPredictions, birdid == predictions))
accuracy_pct = round(correct_predictions / nrow(test) * 100, 2)
print(paste(correct_predictions, "/", nrow(test), "wavs in the test dataset correctly identifed. Accuracy: ", accuracy_pct, "%"))
```


## Deep learning classification with the R interface to Keras based on MFCC

```{r keras, warning=FALSE}
X_train = trainM
Y_train = to_categorical(as.integer(trainM.labels) - 1)
X_test = testM
Y_test = to_categorical(as.integer(testM.labels) - 1)

# Build the sequential model
model = keras_model_sequential()
model %>%
  # Input shape layer = c(samples, rows, cols, channels)
  layer_reshape(input_shape=ncol(X_train),target_shape=c(1,1,ncol(X_train))) %>% 
  # First conv 2d layer with 128 neurons, kernel size of 8 x 8 and stride of 1 x 1
  layer_conv_2d(128, c(8,8), c(1,1), padding='same') %>%
  layer_batch_normalization() %>%
  layer_activation("relu") %>%
  layer_dropout(0.2) %>%
  # Second conv 2d layer with 256 neurons, kernel size of 5 x 5 and stride of 1 x 1
  layer_conv_2d(256, c(5,5), c(1,1), padding='same') %>%
  layer_batch_normalization() %>%
  layer_activation("relu") %>%
  layer_dropout(0.2) %>%
  # Third conv 2d layer with 128 neurons, kernel size of 3 x 3 and stride of 1 x 1
  layer_conv_2d(128, c(3,3), c(1,1), padding='same') %>%
  layer_batch_normalization() %>%
  layer_activation("relu") %>%
  layer_dropout(0.2) %>%
  # Average pooling layer
  layer_global_average_pooling_2d() %>%
  # Activation output layer with 2 classes
  layer_dense(units = ncol(Y_train),  activation='softmax')

# Model compile
model %>% compile(loss = 'categorical_crossentropy',
                 optimizer = "adam",
                 metrics = "categorical_accuracy")


# Add a callback to reduce the learning rate when reaching the plateau
reduce_lr <- callback_reduce_lr_on_plateau(monitor = 'loss', factor = 0.5,
                                           patience = 50, min_lr = 0.0001)
# Start learning
history = model %>% fit(X_train, Y_train, batch_size = 32, epochs = 50,
             validation_data = list(X_test, Y_test),
             verbose = 0, callbacks = reduce_lr)
```

```{r keras-plot}
plot(history)
```

### Check accuracy of keras model

```{r keras-acc}
# Score on the test set
model %>% evaluate(X_test, Y_test, batch_size = 32)

predictions = character(nrow(test))
for (i in 1:nrow(test)) {
  mel = testMels[[i]]
  prediction = which.max(colMeans(predict_proba(model, mel)))
  prediction = levels(testM.labels)[prediction]
  predictions[i] = prediction
}
testWithPredictions = data.frame(test, predictions)
testWithPredictions
correct_predictions = nrow(filter(testWithPredictions, birdid == predictions))
accuracy_pct = round(correct_predictions / nrow(test) * 100, 2)
print(paste(correct_predictions, "/", nrow(test), "wavs in the test dataset correctly identifed. Accuracy: ", accuracy_pct, "%"))
```

randomForest did better.