[egs,scripts] Add Tunisian Arabic (MSA) recipe; cosmetic fixes to pbs…

….pl (#2725)
kaldi-asr · Sep 26, 2018 · 6d919f0 · 6d919f0
1 parent f1f9a48
commit 6d919f0
Show file tree

Hide file tree

Showing 33 changed files with 2,210 additions and 26 deletions.
diff --git a/egs/tunisian_msa/s5/README b/egs/tunisian_msa/s5/README
@@ -0,0 +1,24 @@
+A Kaldi recipe for Arabic using the Tunisian_MSA corpus.
+
+Extra Requirements:
+This recipe uses the QCRI lexicon which uses the Buckwalter encoding.
+In order to convert the Buckwalter to utf-8, the Encode::Arabic::Buckwalter perl module is required.
+On Ubuntu, install the package libencode-arabic-perl.
+On Mac OS X, use cpanm (cpanminus) to install the Perl module.
+
+Description of the Tunisian_MSA Corpus
+The Tunisian_MSA corpus was originally collected to train acoustic models for pronunciation modeling in Arabic language learning applications.
+The data collection took place in 2003 at the Military Academy of Fondouk Jedied, near Tunis, the capital of the Republic of Tunisia.
+The Tunisian_MSA corpus is divided into recited and prompted speech subcorpora.
+The recited speech appears under the recordings directory and the prompted speech under the answers directory.
+Each of the 118 informants contributed to both subcorpora by reciting sentences and providing answers to prompted questions. 
+The Tunisian_MSA corpus has 11.2 hours of speech.
+
+With the exception of speech from two speakers, all of the corpus was used for training.
+
+A small corpus was collected for testing.
+
+A pronunciation dictionary is also available from openslr.org.
+It covers all the words uttered in the Tunisian_MSA corpus and the test corpus.
+The QCRI lexicon was used as a starting point for writing this lexicon.
+The phones are the same as those used in the QCRI lexicon.
diff --git a/egs/tunisian_msa/s5/cmd.sh b/egs/tunisian_msa/s5/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in https://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
+export mkgraph_cmd="queue.pl --mem 8G"
diff --git a/egs/tunisian_msa/s5/conf/mfcc.conf b/egs/tunisian_msa/s5/conf/mfcc.conf
@@ -0,0 +1 @@
+--use-energy=false # only non-default option.
diff --git a/egs/tunisian_msa/s5/conf/mfcc_hires.conf b/egs/tunisian_msa/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why 
+# we prefer this method.
+--use-energy=false # use average of log energy, not energy.
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so
+ # there might be some information at the low end.
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
diff --git a/egs/tunisian_msa/s5/conf/online_cmvn.conf b/egs/tunisian_msa/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/tunisian_msa/s5/conf/pitch.conf b/egs/tunisian_msa/s5/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/tunisian_msa/s5/conf/plp.conf b/egs/tunisian_msa/s5/conf/plp.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/tunisian_msa/s5/local/answers_make_lists.pl b/egs/tunisian_msa/s5/local/answers_make_lists.pl
@@ -0,0 +1,77 @@
+#!/usr/bin/env perl
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+# answers_make_lists.pl - make acoustic model training lists
+
+use strict;
+use warnings;
+use Carp;
+
+use File::Spec;
+use File::Copy;
+use File::Basename;
+
+# Working directory that holds the intermediate files of this script.
+my $tmpdir = 'data/local/tmp/tunis';
+
+# Create the output directory.  The list form of system bypasses the shell,
+# and the exit status is checked so that a failure is reported here instead
+# of surfacing later as an unrelated-looking error.
+system( 'mkdir', '-p', "$tmpdir/answers" ) == 0
+    or croak "could not create $tmpdir/answers (exit status $?)";
+
+# input wav file list
+my $wav_list = "$tmpdir/answers_wav.txt";
+
+# output temporary wav.scp files
+my $wav_scp = "$tmpdir/answers/wav.scp";
+
+# output temporary utt2spk files
+my $u = "$tmpdir/answers/utt2spk";
+
+# output temporary text files
+my $t = "$tmpdir/answers/text";
+
+# initialize hash for prompts (utterance id => prompt sentence)
+my %prompt = ();
+
+# Store the prompts in %prompt, keyed by utterance id.
+# Input comes from the files named on the command line (or STDIN).  Each
+# line is split at the first tab into an id and a sentence.  The id is split
+# on underscores; its first, second and fifth fields are combined into the
+# key "<first>_<second>_a_<fifth>".
+LINEA: while ( my $line = <> ) {
+    chomp $line;
+    my ($num,$sent) = split /\t/sxm, $line, 2;
+
+    # A blank line yields no fields at all; there is nothing to store.
+    next LINEA if ( !defined $num );
+
+    # The third and fourth id fields are not needed for the key.
+    my ($machine,$s,undef,undef,$i) = split /\_/sxm, $num;
+
+    # An id with fewer than five fields cannot form a key; report it and
+    # move on instead of building a key from undefined pieces.
+    if ( !defined $i ) {
+        print STDERR "skipping malformed prompt id at input line $.: $num\n"
+            or croak;
+        next LINEA;
+    }
+
+    # A line without a tab has no sentence; keep it as an empty prompt so
+    # that later string interpolation does not see an undefined value.
+    $sent = q{} if ( !defined $sent );
+
+    # the utterance name
+    my $utt = join '_', $machine, $s, 'a', $i;
+    $prompt{$utt} = $sent;
+}
+
+# Write wav.scp, utt2spk and text files.
+# Reads the wav file list line by line.  Only paths that contain "Answers"
+# and do not contain "Recordings" are processed.  The utterance id is
+# "<dir[-3]>_<dir[-1]>_a_<basename without .wav>", built from the path's
+# directory components.  Paths whose id has no stored prompt are reported
+# on STDERR and skipped.
+# The output files are only written, so they are opened with '>' (truncate,
+# write-only) rather than '+>'.
+open my $W, '<', $wav_list or croak "problem with $wav_list $!";
+open my $O, '>', $wav_scp or croak "problem with $wav_scp $!";
+open my $U, '>', $u or croak "problem with $u $!";
+open my $T, '>', $t or croak "problem with $t $!";
+
+LINE: while ( my $line = <$W> ) {
+    chomp $line;
+    next LINE if ( $line !~ /Answers/sxm );
+    next LINE if ( $line =~ /Recordings/sxm );
+
+    # Only the directory part of the path is needed here.
+    my (undef,$directories) = File::Spec->splitpath( $line );
+    my @dirs = split /\//sxm, $directories;
+
+    # The id uses the third-from-last and last directory names, so a
+    # shallower path cannot be mapped to an utterance.
+    if ( @dirs < 3 ) {
+        print STDERR "problem\tpath too short: $line\n" or croak;
+        next LINE;
+    }
+
+    my $r = basename $line, '.wav';
+    my $machine = $dirs[-3];
+    my $s = $dirs[-1];
+    my $rid = join '_', $machine, $s, 'a', $r;
+
+    # No prompt for this recording: report it (one per line) and skip it.
+    if ( !exists $prompt{$rid} ) {
+        print STDERR "problem\t$rid\n" or croak;
+        next LINE;
+    }
+
+    print ${T} "$rid\t$prompt{$rid}\n" or croak;
+    print ${O} "$rid sox $line -t wav - |\n" or croak;
+    print ${U} "$rid ${machine}_${s}_a\n" or croak;
+}
+
+# Buffered write errors surface at close, so every close is checked.
+close $U or croak "problem closing $u $!";
+close $T or croak "problem closing $t $!";
+close $W or croak "problem closing $wav_list $!";
+close $O or croak "problem closing $wav_scp $!";