Support the new data format for ds2/st

PaddlePaddle · Nov 22, 2021 · b944418 · b944418
1 parent 02c7ef3
commit b944418
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 11 deletions.
diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py
@@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix):
  continue
  audio_data, samplerate = soundfile.read(audio_path)
  duration = float(len(audio_data) / samplerate)
+
+
+ translation_str = " ".join(translation.split())
+ trancription_str = " ".join(trancription.split())
  json_lines.append(
  json.dumps(
  {
  'utt': utt,
  'feat': audio_path,
  'feat_shape': (duration, ), # second
- 'text': " ".join(translation.split()),
- 'text1': " ".join(trancription.split())
+ 'text': [translation_str, trancription_str], 
  },
  ensure_ascii=False))
 

diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh
@@ -9,7 +9,7 @@ stop_stage=100
 nbpe=8000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
-data_dir=./TED_EnZh
+data_dir=./TED-En-Zh
 
 
 source ${MAIN_ROOT}/utils/parse_options.sh
@@ -21,7 +21,7 @@ mkdir -p data
 
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
  if [ ! -e ${data_dir} ]; then
- echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
+ echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset"
  echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
  echo "The tree of the directory should be:"
  echo "."
@@ -88,7 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
  # format manifest with tokenids, vocab size
  for set in train dev test; do
  {
- python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
+ python3 ${MAIN_ROOT}/utils/format_data.py \
  --cmvn_path "data/mean_std.json" \
  --unit_type "spm" \
  --spm_model_prefix ${bpeprefix} \

diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py
@@ -237,8 +237,8 @@ def __call__(self, batch):
  for idx, item in enumerate(batch):
  utts.append(item['utt'])
 
- audio = item['feat']
- text = item['text']
+ audio = item['input'][0]['feat']
+ text = item['output'][0]['text']
  audio, text = self.process_utterance(audio, text)
 
  audios.append(audio) # [T, D]
@@ -381,9 +381,10 @@ def __call__(self, batch):
  for idx, item in enumerate(batch):
  utts.append(item['utt'])
 
- audio = item['feat']
- translation = item['text']
- transcription = item['text1']
+ audio = item['input'][0]['feat']
+ translation = item['output'][0]['text']
+ transcription = item['output'][1]['text']
+
  audio, translation, transcription = self.process_utterance(
  audio, translation, transcription)
 

diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
@@ -122,7 +122,7 @@ def __init__(self,
  min_output_len=min_output_len,
  max_output_input_ratio=max_output_input_ratio,
  min_output_input_ratio=min_output_input_ratio)
- self._manifest.sort(key=lambda x: x["feat_shape"][0])
+ self._manifest.sort(key=lambda x: x["input"][0]["shape"][0])
 
  def __len__(self):
  return len(self._manifest)