Add methods to convert words to id

abhishekraok · web-flow · commit 5cd74b5ddc6d · 2017-02-08T14:55:07.000-08:00
diff --git a/.travis.yml b/.travis.yml
@@ -4,7 +4,7 @@ python:
 # command to install dependencies
 install:
   - pip install -r requirements.txt
-  - sudo mv data/nltk_data /usr/share/nltk_data    
+  - sudo mv nltk_data /usr/share/nltk_data    
 # command to run tests
 script: 
     - cd streampredictor
diff --git a/nltk_data/tokenizers/punkt/english.pickle b/nltk_data/tokenizers/punkt/english.pickle
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
-nltk==3.2.2
-numpy==1.12.0
-protobuf==3.2.0
-matplotlib==2.0.0
-progressbar2==3.12.0
+nltk
+numpy
+protobuf
+matplotlib
+progressbar
diff --git a/streampredictor/DataObtainer.py b/streampredictor/DataObtainer.py
@@ -44,14 +44,18 @@ def get_clean_words_from_file(file, max_input_length):
         text = opened_file.read()
         return nltk.word_tokenize(clean_text(text))[:max_input_length]
 
+def get_words_from_ptb(file, max_input_length):
+    """Return the first max_input_length whitespace-separated tokens read from the text file `file`."""
+    with open(file) as opened_file:  # split() treats newlines as separators and never yields '' tokens
+        return opened_file.read().split()[:max_input_length]
 
 def clean_text(text, max_input_length=10**10000):
     text = text.replace('\n', ' ')
     max_length = min(max_input_length, len(text))
     rotation = random.randint(0,max_length)
     text = text[rotation:max_length] + text[:rotation]
     # make sure to remove # for category separation
-    text = ''.join(e for e in text if e.isalnum() or e in '.?", ')
+    text = ''.join(e for e in text if e.isalnum() or e in '.?", <>')
     return text
 
 
@@ -60,7 +64,18 @@ def get_online_words(max_input_length):
     words = nltk.word_tokenize(clean_text(text, max_input_length))
     return words
 
+def convert_words_to_id(words):
+    """
+    Converts words list to id list; ids are assigned 0, 1, 2, ... in order of first appearance.
+    Returns (id_sequence, word2id, id2word); deterministic, and works on Python 2.7 and 3.
+    """
+    word2id = {}
+    id_sequence = [word2id.setdefault(word, len(word2id)) for word in words]  # new word -> next free id
+    id2word = {index: word for word, index in word2id.items()}
+    return id_sequence, word2id, id2word
 
 if __name__ == '__main__':
-    text = get_random_book_local('../data')
-    print(text)
+    words = get_words_from_ptb('../data/ptb.test.txt', max_input_length=100)
+    print(words)
+    seq, word2id, id2word = convert_words_to_id(words)
+    print(seq)
diff --git a/streampredictor/test_word_predictor.py b/streampredictor/test_word_predictor.py
@@ -1,6 +1,7 @@
 from unittest import TestCase
 
 from . import WordPredictor
+from . import DataObtainer
 
 training_text = 'cat hat mat bat sat in the barn'
 words = training_text.split(' ')
@@ -18,3 +19,12 @@ def test_generates_sample(self):
         wp.train(words)
         generated_text = wp.generate(5)
         self.assertGreater(len(generated_text.split(' ')), 5)
+
+
+class TestDataObtainer(TestCase):
+    def test_convert_word_2_id(self):
+        test_words = ['aaa', 'bbb', 'ccc', 'aaa']
+        seq, word2id, id2word = DataObtainer.convert_words_to_id(test_words)
+        self.assertEqual([0, 1, 2], sorted(set(seq)))  # ids are dense; which word gets which id is not asserted
+        self.assertEqual(seq, [word2id[word] for word in test_words])  # repeated word -> same id
+        self.assertEqual(test_words, [id2word[i] for i in seq])  # decoding round-trips