Skip to content

Commit

Permalink
add greek char and fix issue2571 (#2683)
Browse files Browse the repository at this point in the history
Co-authored-by: TianYuan <[email protected]>
  • Loading branch information
david-95 and yt605155624 committed Nov 28, 2022
1 parent 58309aa commit bd01bc1
Showing 1 changed file with 28 additions and 3 deletions.
31 changes: 28 additions & 3 deletions paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _split(self, text: str, lang="zh") -> List[str]:
if lang == "zh":
text = text.replace(" ", "")
# 过滤掉特殊字符
text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text)
text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|…\\]', '', text)
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
text = text.strip()
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
Expand All @@ -85,7 +85,33 @@ def _post_replace(self, sentence: str) -> str:
sentence = sentence.replace('⑧', '八')
sentence = sentence.replace('⑨', '九')
sentence = sentence.replace('⑩', '十')

sentence = sentence.replace('α', '阿尔法')
sentence = sentence.replace('β', '贝塔')
sentence = sentence.replace('γ', '伽玛').replace('Γ', '伽玛')
sentence = sentence.replace('δ', '德尔塔').replace('Δ', '德尔塔')
sentence = sentence.replace('ε', '艾普西龙')
sentence = sentence.replace('ζ', '捷塔')
sentence = sentence.replace('η', '依塔')
sentence = sentence.replace('θ', '西塔').replace('Θ', '西塔')
sentence = sentence.replace('ι', '艾欧塔')
sentence = sentence.replace('κ', '喀帕')
sentence = sentence.replace('λ', '拉姆达').replace('Λ', '拉姆达')
sentence = sentence.replace('μ', '缪')
sentence = sentence.replace('ν', '拗')
sentence = sentence.replace('ξ', '克西').replace('Ξ', '克西')
sentence = sentence.replace('ο', '欧米克伦')
sentence = sentence.replace('π', '派').replace('Π', '派')
sentence = sentence.replace('ρ', '肉')
sentence = sentence.replace('ς', '西格玛').replace('Σ', '西格玛').replace(
'σ', '西格玛')
sentence = sentence.replace('τ', '套')
sentence = sentence.replace('υ', '宇普西龙')
sentence = sentence.replace('φ', '服艾').replace('Φ', '服艾')
sentence = sentence.replace('χ', '器')
sentence = sentence.replace('ψ', '普赛').replace('Ψ', '普赛')
sentence = sentence.replace('ω', '欧米伽').replace('Ω', '欧米伽')
# Filter special characters again; this pattern strips one more character ("-") than the one used in _split.
sentence = re.sub(r'[-——《》【】<=>{}()()#&@“”^_|…\\]', '', sentence)
return sentence

def normalize_sentence(self, sentence: str) -> str:
Expand Down Expand Up @@ -124,6 +150,5 @@ def normalize_sentence(self, sentence: str) -> str:

def normalize(self, text: str) -> List[str]:
    """Split ``text`` into sentences and normalize each of them.

    Args:
        text: Raw input text.

    Returns:
        The normalized sentences, in the order ``self._split`` produced them.
    """
    # Split once, then run every piece through the per-sentence normalizer.
    return [self.normalize_sentence(piece) for piece in self._split(text)]

0 comments on commit bd01bc1

Please sign in to comment.