From dad1cbbcd6cfc8d2530de48cdff3b325b6d2de8c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 26 Nov 2021 09:12:29 +0000 Subject: [PATCH] update text frontend --- demos/style_fs2/style_syn.py | 4 ++- examples/ljspeech/voc1/README.md | 2 +- .../t2s/exps/fastspeech2/inference.py | 4 ++- .../fastspeech2/multi_spk_synthesize_e2e.py | 4 ++- .../t2s/exps/fastspeech2/synthesize_e2e.py | 4 ++- .../exps/fastspeech2/synthesize_e2e_melgan.py | 4 ++- .../t2s/exps/speedyspeech/inference.py | 4 ++- .../t2s/exps/speedyspeech/synthesize_e2e.py | 4 ++- paddlespeech/t2s/frontend/zh_frontend.py | 7 ++++- .../frontend/zh_normalization/chronology.py | 26 +++++++++++++++++++ .../frontend/zh_normalization/phonecode.py | 7 +++-- .../zh_normalization/text_normlization.py | 10 +++++++ .../t2s/models/fastspeech2/fastspeech2.py | 2 +- 13 files changed, 70 insertions(+), 12 deletions(-) diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py index 5b8ce35139a..9bd61579000 100644 --- a/demos/style_fs2/style_syn.py +++ b/demos/style_fs2/style_syn.py @@ -34,7 +34,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 13cc6ed7e46..3830156f9fe 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -137,4 +137,4 @@ pwg_ljspeech_ckpt_0.5 └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` ## Acknowledgement -We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. \ No newline at end of file +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/paddlespeech/t2s/exps/fastspeech2/inference.py b/paddlespeech/t2s/exps/fastspeech2/inference.py index 07e9ed7ee33..8ea64b9934b 100644 --- a/paddlespeech/t2s/exps/fastspeech2/inference.py +++ b/paddlespeech/t2s/exps/fastspeech2/inference.py @@ -82,7 +82,9 @@ def main(): with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py index 1839415e978..a2f8ada69f3 100644 --- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -37,7 +37,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py index ff9a41eabbb..aac2c054e21 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py @@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py index f0ff5655dc1..527e5d41079 100644 --- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py +++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py @@ -40,7 +40,9 @@ def evaluate(args, fastspeech2_config, melgan_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py index 617848c5845..75f937decdc 100644 --- a/paddlespeech/t2s/exps/speedyspeech/inference.py +++ b/paddlespeech/t2s/exps/speedyspeech/inference.py @@ -87,7 +87,9 @@ def main(): with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) for utt_id, sentence in sentences: diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 0e64088dcd1..b0418940528 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -40,7 +40,9 @@ def evaluate(args, speedyspeech_config, pwg_config): sentences = [] with open(args.text, 'rt') as f: for line in f: - utt_id, sentence = line.strip().split() + items = line.strip().split() + utt_id = items[0] + sentence = ",".join(items[1:]) sentences.append((utt_id, sentence)) with open(args.phones_dict, "r") as f: diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index d49c09378a2..5b69477da6f 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -149,9 +149,14 @@ def _merge_erhua(self, if word not in self.must_erhua and (word in self.not_erhua or pos in {"a", "j", "nr"}): return initials, finals + # "……" 等情况直接返回 + if len(finals) != len(word): + return initials, finals + + assert len(finals) == len(word) + new_initials = [] new_finals = [] - assert len(finals) == len(word) for i, phn in enumerate(finals): if i == len(finals) - 1 and word[i] == "儿" and phn in { "er2", "er5" diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index b8d711564c9..8801baa0d5a 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -32,6 +32,15 @@ def _time_num2str(num_string: str) -> str: r':([0-5][0-9])' r'(:([0-5][0-9]))?') +# 时间范围,如8:30-12:30 +RE_TIME_RANGE = re.compile(r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?' + r'(~|-)' + r'([0-1]?[0-9]|2[0-3])' + r':([0-5][0-9])' + r'(:([0-5][0-9]))?') + def replace_time(match) -> str: """ @@ -42,15 +51,32 @@ def replace_time(match) -> str: ---------- str """ + + is_range = len(match.groups()) > 5 + hour = match.group(1) minute = match.group(2) second = match.group(4) + if is_range: + hour_2 = match.group(6) + minute_2 = match.group(7) + second_2 = match.group(9) + result = f"{num2str(hour)}点" if minute.lstrip('0'): result += f"{_time_num2str(minute)}分" if second and second.lstrip('0'): result += f"{_time_num2str(second)}秒" + + if is_range: + result += "至" + result += f"{num2str(hour_2)}点" + if minute_2.lstrip('0'): + result += f"{_time_num2str(minute_2)}分" + if second_2 and second_2.lstrip('0'): + result += f"{_time_num2str(second_2)}秒" + return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py index be159c2395d..b7b69b41b22 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py +++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py @@ -26,16 +26,19 @@ RE_TELEPHONE = re.compile( r"(? str: if mobile: sp_parts = phone_string.strip('+').split() - result = ''.join( + result = ','.join( [verbalize_digit(part, alt_one=True) for part in sp_parts]) return result else: sil_parts = phone_string.split('-') - result = ''.join( + result = ','.join( [verbalize_digit(part, alt_one=True) for part in sil_parts]) return result diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index e25e9901914..c3885fb9b4a 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -18,6 +18,7 @@ from .chronology import RE_DATE from .chronology import RE_DATE2 from .chronology import RE_TIME +from .chronology import RE_TIME_RANGE from .chronology import replace_date from .chronology import replace_date2 from .chronology import replace_time @@ -40,6 +41,7 @@ from .num import replace_positive_quantifier from .num import replace_range from .phonecode import RE_MOBILE_PHONE +from .phonecode import RE_NATIONAL_UNIFORM_NUMBER from .phonecode import RE_TELEPHONE from .phonecode import replace_mobile from .phonecode import replace_phone @@ -76,12 +78,19 @@ def normalize_sentence(self, sentence: str) -> str: # number related NSW verbalization sentence = RE_DATE.sub(replace_date, sentence) sentence = RE_DATE2.sub(replace_date2, sentence) + + # range first + sentence = RE_TIME_RANGE.sub(replace_time, sentence) sentence = RE_TIME.sub(replace_time, sentence) + sentence = RE_TEMPERATURE.sub(replace_temperature, sentence) sentence = RE_FRAC.sub(replace_frac, sentence) sentence = RE_PERCENTAGE.sub(replace_percentage, sentence) sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence) + sentence = RE_TELEPHONE.sub(replace_phone, sentence) + sentence = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, sentence) + sentence = RE_RANGE.sub(replace_range, sentence) sentence = RE_INTEGER.sub(replace_negative_num, sentence) sentence = RE_DECIMAL_NUM.sub(replace_number, sentence) @@ -94,5 +103,6 @@ def normalize_sentence(self, sentence: str) -> str: def normalize(self, text: str) -> List[str]: sentences = self._split(text) + sentences = [self.normalize_sentence(sent) for sent in sentences] return sentences diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index aa42a83dec7..cdec03abc15 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -307,7 +307,7 @@ def __init__( num_embeddings=idim, embedding_dim=adim, padding_idx=self.padding_idx) - + if encoder_type == "transformer": print("encoder_type is transformer") self.encoder = TransformerEncoder(