66import torchaudio
77import re
88
9- def split_sentence (text , min_len = 0 , language_str = 'EN' ):
9+ def split_sentence (text , min_len = 10 , language_str = 'EN' ):
1010 if language_str in ['EN' , 'FR' , 'ES' , 'SP' ]:
1111 sentences = split_sentences_latin (text , min_len = min_len )
1212 else :
1313 sentences = split_sentences_zh (text , min_len = min_len )
1414 return sentences
1515
1616
17- def split_sentences_latin (text , min_len = 0 ):
17+ def split_sentences_latin (text , min_len = 10 ):
1818 text = re .sub ('[。!?;]' , '.' , text )
1919 text = re .sub ('[,]' , ',' , text )
2020 text = re .sub ('[“”]' , '"' , text )
@@ -23,7 +23,7 @@ def split_sentences_latin(text, min_len=0):
2323 return [item .strip () for item in txtsplit (text , 256 , 512 ) if item .strip ()]
2424
2525
26- def split_sentences_zh (text , min_len = 0 ):
26+ def split_sentences_zh (text , min_len = 10 ):
2727 text = re .sub ('[。!?;]' , '.' , text )
2828 text = re .sub ('[,]' , ',' , text )
2929 # 将文本中的换行符、空格和制表符替换为空格
@@ -171,4 +171,4 @@ def commit():
171171 print (split_sentence (zh_text , language_str = 'ZH' ))
172172 print (split_sentence (en_text , language_str = 'EN' ))
173173 print (split_sentence (sp_text , language_str = 'SP' ))
174- print (split_sentence (fr_text , language_str = 'FR' ))
174+ print (split_sentence (fr_text , language_str = 'FR' ))
0 commit comments