
Commit be2f9ab

replace nltk punkt with punkt_tab and update documentation and tests accordingly
1 parent 6042eeb commit be2f9ab

4 files changed: +5 −5 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -342,7 +342,7 @@ print(f'Run time: {time:.2f}s')
 rules.to_csv('output.csv')
 ```
 
-**Note:** You may need to download stopwords and the punkt tokenizer from nltk by running `import nltk; nltk.download('stopwords'); nltk.download('punkt')`.
+**Note:** You may need to download stopwords and the punkt_tab tokenizer from nltk by running `import nltk; nltk.download('stopwords'); nltk.download('punkt_tab')`.
 
 For a full list of examples see the [examples folder](https://github.com/firefly-cpp/NiaARM/tree/main/examples)
 in the GitHub repository.

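The updated note boils down to a one-time setup step. Below is a minimal sketch of that step, assuming a recent NLTK release in which `word_tokenize` loads the `punkt_tab` models rather than the old `punkt` pickles:

```python
# One-time NLTK setup for NiaARM's text mining, as described in the README note.
import nltk

nltk.download("punkt_tab")  # tokenizer models (successor to the old "punkt" resource)
nltk.download("stopwords")  # stopword lists used when building the corpus

# Quick sanity check that both resources resolve.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

print(word_tokenize("NiaARM mines association rules from text."))
print(len(stopwords.words("english")), "English stopwords available")
```
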
docs/getting_started.rst

Lines changed: 1 addition & 1 deletion
@@ -285,7 +285,7 @@ added to the :mod:`niaarm.mine` module.
 print('No rules generated')
 print(f'Run time: {time:.2f}s')
 
-**Note:** You may need to download stopwords and the punkt tokenizer from nltk by running `import nltk; nltk.download('stopwords'); nltk.download('punkt')`.
+**Note:** You may need to download stopwords and the punkt_tab tokenizer from nltk by running `import nltk; nltk.download('stopwords'); nltk.download('punkt_tab')`.
 
 **Output:**
 

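Since the getting-started example may be run repeatedly, a guarded download is one possible refinement of the note. A sketch, assuming the resources live at the paths `tokenizers/punkt_tab` and `corpora/stopwords` (these paths are an assumption, not taken from the commit):

```python
# Download punkt_tab and stopwords only when they are missing.
# The resource paths below are assumed, not taken from the NiaARM docs.
import nltk

RESOURCES = [("punkt_tab", "tokenizers/punkt_tab"), ("stopwords", "corpora/stopwords")]

for name, path in RESOURCES:
    try:
        nltk.data.find(path)  # raises LookupError if the resource is not installed
    except LookupError:
        nltk.download(name)
```
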
examples/text_mining.py

Lines changed: 2 additions & 2 deletions
@@ -7,13 +7,13 @@
 df = pd.read_json("datasets/text/artm_test_dataset.json", orient="records")
 documents = df["text"].tolist()
 
-# create a Corpus object from the documents (requires nltk's punkt tokenizer and the stopwords list)
+# create a Corpus object from the documents (requires nltk's punkt_tab tokenizer and the stopwords list)
 try:
     corpus = Corpus.from_list(documents)
 except LookupError:
     import nltk
 
-    nltk.download("punkt")
+    nltk.download("punkt_tab")
     nltk.download("stopwords")
     corpus = Corpus.from_list(documents)

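For context on the `LookupError` handling above: `Corpus.from_list` needs both resources, and when either is missing NLTK raises `LookupError`, which the example catches and resolves by downloading. A rough illustration of the preprocessing those resources enable, using plain NLTK calls rather than NiaARM's actual `Corpus` internals (the sample documents are made up):

```python
# Plain-NLTK sketch of tokenization plus stopword removal; not NiaARM's Corpus code.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

docs = ["NiaARM mines association rules.", "Text mining needs a tokenizer."]
stop_words = set(stopwords.words("english"))

tokens_per_doc = [
    [tok.lower() for tok in word_tokenize(doc) if tok.isalpha() and tok.lower() not in stop_words]
    for doc in docs
]
print(tokens_per_doc)
```
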
tests/test_text_mining.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 class TestTextMining(TestCase):
     def setUp(self):
-        nltk.download("punkt")
+        nltk.download("punkt_tab")
         nltk.download("stopwords")
         ds_path = os.path.join(
             os.path.dirname(__file__), "test_data", "artm_test_dataset.json"

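The test change is a straight swap of the resource name inside `setUp`. One possible variant, not what the commit does, is to download once per test class and silence NLTK's progress output with `quiet=True`; a sketch with a hypothetical class name:

```python
# Hypothetical variant of the test setup: class-level, quiet downloads.
import os
import unittest

import nltk


class TestTextMiningQuiet(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        nltk.download("punkt_tab", quiet=True)
        nltk.download("stopwords", quiet=True)
        cls.ds_path = os.path.join(
            os.path.dirname(__file__), "test_data", "artm_test_dataset.json"
        )
```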