
Commit ade388b

Authored by qinzzz and hunterhector
Fix #347: only create object UnicodeRegex when used. (#349)
* fix #347: only create object UnicodeRegex when used.
* fix #347: only create object UnicodeRegex when used.
* use lru cache to generate object; update workflow
* update requirements
* Update setup.py

Co-authored-by: qinzzz <wangqinxin2007@126.com>
Co-authored-by: Hector <hunterhector@gmail.com>
1 parent d98c2c2 commit ade388b

File tree

3 files changed: +9 additions, -5 deletions

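The fix described in the commit message, "use lru cache to generate object", is the standard lazy-initialization idiom: wrap the expensive constructor in a zero-argument function decorated with functools.lru_cache, so the object is built on first use instead of at import time. Below is a minimal, self-contained sketch of that pattern; the names ExpensiveObject and get_expensive_object are illustrative only, and the actual change appears in the bleu_transformer.py diff further down.

import functools


class ExpensiveObject:
    """Stand-in for a class whose constructor is costly (e.g. building large Unicode regexes)."""

    def __init__(self):
        print("constructing ExpensiveObject")  # runs exactly once


@functools.lru_cache(maxsize=1)
def get_expensive_object() -> ExpensiveObject:
    # Nothing is built when the module is imported; the first call constructs
    # the object, and lru_cache returns the same instance on every later call.
    return ExpensiveObject()


first = get_expensive_object()   # prints "constructing ExpensiveObject"
second = get_expensive_object()  # served from the cache, no new construction
assert first is second

Because the getter takes no arguments there is only one cache key, so maxsize=1 never evicts and the function behaves like a lazily created module-level singleton.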

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -7,4 +7,3 @@ sentencepiece >= 0.1.8
 dill >= 0.3.3
 nni >= 2.0.0
 six >= 1.15
-

setup.py

Lines changed: 4 additions & 3 deletions
@@ -38,18 +38,19 @@
         'sentencepiece>=0.1.96',
         'mypy_extensions',
         'packaging>=19.0',
-        'asyml-utilities>=0.0.1.dev1'
+        'six',
+        'asyml-utilities>=0.0.1.dev1',
     ],
     extras_require={
         'torch': ['torch>=1.0.0'],
         'examples': [],
         'extras': ['Pillow>=3.0', 'tensorboardX>=1.8', 'six>=1.15'],
-        'dist': ['adaptdl>=0.2.4']
+        'dist': ['adaptdl>=0.2.4'],
     },
     package_data={
         "texar.torch": [
             "../../bin/utils/multi-bleu.perl",
-        ]
+        ],
     },
     classifiers=[
         'Intended Audience :: Developers',

texar/torch/evals/bleu_transformer.py

Lines changed: 5 additions & 1 deletion
@@ -23,6 +23,7 @@
 import unicodedata
 import collections
 import math
+import functools
 import numpy as np
 
 from texar.torch.evals.bleu import corpus_bleu
@@ -156,7 +157,9 @@ def property_chars(prefix):
     )
 
 
-uregex = UnicodeRegex()
+@functools.lru_cache(1)
+def _get_unicode_regex() -> UnicodeRegex:
+    return UnicodeRegex()
 
 
 def bleu_transformer_tokenize(string: str) -> List[str]:
@@ -188,6 +191,7 @@ def bleu_transformer_tokenize(string: str) -> List[str]:
     Returns:
         a list of tokens
     """
+    uregex = _get_unicode_regex()
     string = uregex.nondigit_punct_re.sub(r"\1 \2 ", string)
     string = uregex.punct_nondigit_re.sub(r" \1 \2", string)
     string = uregex.symbol_re.sub(r" \1 ", string)
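With this change, importing texar.torch.evals.bleu_transformer no longer constructs UnicodeRegex at module load; _get_unicode_regex() builds and caches the instance the first time the tokenizer is called. A small usage sketch, assuming a texar-pytorch installation that includes this commit (the example sentence is illustrative):

from texar.torch.evals.bleu_transformer import bleu_transformer_tokenize

# The import above is now cheap; UnicodeRegex is built lazily inside the
# first call below and reused (via functools.lru_cache) on subsequent calls.
tokens = bleu_transformer_tokenize("Lazy initialization, at last!")
print(tokens)  # punctuation is split off into separate tokens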
