From 1c3d2cb89ef2d3bbef9c166980aa30d48d205134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20An=20=EF=BC=88An=20Hongliang=EF=BC=89?= Date: Mon, 21 Nov 2022 15:25:10 +0800 Subject: [PATCH] add double byte char for zh normalization (#2661) --- .../t2s/frontend/zh_normalization/constants.py | 6 +++--- .../frontend/zh_normalization/text_normlization.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/frontend/zh_normalization/constants.py b/paddlespeech/t2s/frontend/zh_normalization/constants.py index 5d2b0b34ea3..6423ad74a5c 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/constants.py +++ b/paddlespeech/t2s/frontend/zh_normalization/constants.py @@ -19,7 +19,7 @@ # 全角半角转换 # 英文字符全角 -> 半角映射表 (num: 52) F2H_ASCII_LETTERS = { - chr(ord(char) + 65248): char + ord(char) + 65248: ord(char) for char in string.ascii_letters } @@ -27,12 +27,12 @@ H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()} # 数字字符全角 -> 半角映射表 (num: 10) -F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits} +F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits} # 数字字符半角 -> 全角映射表 H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()} # 标点符号全角 -> 半角映射表 (num: 32) -F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation} +F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation} # 标点符号半角 -> 全角映射表 H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()} diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index 8f8e3b07d12..1942e666126 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -74,6 +74,17 @@ def _split(self, text: str, lang="zh") -> List[str]: def _post_replace(self, sentence: str) -> str: sentence = sentence.replace('/', '每') 
sentence = sentence.replace('~', '至') + sentence = sentence.replace('～', '至') + sentence = sentence.replace('①', '一') + sentence = sentence.replace('②', '二') + sentence = sentence.replace('③', '三') + sentence = sentence.replace('④', '四') + sentence = sentence.replace('⑤', '五') + sentence = sentence.replace('⑥', '六') + sentence = sentence.replace('⑦', '七') + sentence = sentence.replace('⑧', '八') + sentence = sentence.replace('⑨', '九') + sentence = sentence.replace('⑩', '十') return sentence