Skip to content

Commit

Permalink
Merge pull request #2 from breznak/story-teller
Browse files Browse the repository at this point in the history
Merged in Story Teller
  • Loading branch information
chetan51 committed Nov 17, 2013
2 parents 46607b7 + 086bc97 commit 93a6c84
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 19 deletions.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
Linguist
========

An AI running on NuPIC using the CLA to build a model of language, and predict the rest of a user's word, phrase, sentence.
An AI running on [NuPIC](https://github.com/numenta/nupic) using the CLA to build a model of language, and predict the rest of a user's word, phrase, sentence.

The current application works as a "story teller": it trains on a dataset you give it and then produces a story for you. You can specify the number of sentences the story
should have and the beginning words to kick it off, e.g. "There were two brothers ".

Research:

The CLA learns sentences letter-by-letter; we'd like to observe whether grammatical features can emerge. See [Geoffrey Hinton's text-generation deep NNs](https://class.coursera.org/neuralnets-2012-001/lecture/91) for reference.

Usage
========
Expand All @@ -12,4 +19,4 @@ Then, run:

python client/linguist.py data/tiny.txt

It'll run on the tiny.txt dataset, feeding it the sentence over and over again, printing out Linguist's next 10 timestep predictions at each step.
It'll run on the tiny.txt dataset, feeding it the sentence over and over again, printing out Linguist's next 10 timestep predictions at each step.
69 changes: 59 additions & 10 deletions client/linguist.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,14 @@
import model_params
import re

NUM_REPEATS = 1000
NUM_REPEATS = 5
PRINT_EVERY_REPEAT_N = 1

TERMINATORS = ['.','!','?','|']
NUM_SENTENCES = 5 # number for story sentences generated
STORY_START = "The dog"
_QUIT = "QUIT"

def clean(s):
return re.sub('\n', '|', s)

Expand Down Expand Up @@ -74,30 +79,74 @@ def runLinguist(datapath):
if should_print:
print "\n====== Repeat #%d =======\n" % (r + 1)

last_c = ''

with open(datapath) as f:
while True:
c = f.read(1)
if not c: break

if ord(c) <= 31 and ord(c) >= 127 and ord(c) != 10: continue
if (last_c == ' ' or last_c == '\n') and c == last_c: continue

last_c = c
if not ( (ord(c) >= 31 and ord(c) <= 127) or c == '\n'): continue

modelInput = {'letter': c}
result = model.run(modelInput)

if should_print:
print "[%i]\t %s ==> %s\t(%s)" % (i, clean(modelInput['letter']), prediction(result.inferences), confidences(result.inferences))
if c in TERMINATORS:
model.resetSequenceStates()
print "reset"

i += 1

return model

def tellStory(model, startSent, lenght):
"""feed startSent sequence and then continue to generate the story by the CLA.
param model: the trained CLA model
param startSent: starting sequence as a string, eg \"And so he raised the gun\"
param lenght: #sequences to generate. """

model.disableLearning()
for s in startSent:
print(s),
modelInput = {'letter': s}
result = model.run(modelInput)

numSent = 0
c = s
sentence_len = 0
while numSent <= lenght:
print(c),
modelInput = {'letter': c}
result = model.run(modelInput)
c=result.inferences['prediction'][0]

sentence_len += 1
if(sentence_len > 30): #limit, sometimes there's no sentence terminator generated and we'd run forever
numSent += 1
sentence_len = 0

if c in TERMINATORS:
numSent += 1
sentence_len = 0
print(' \n')


if __name__ == "__main__":
if len(sys.argv) > 1:
datapath = sys.argv[1]
runLinguist(datapath)
if len(sys.argv) > 2:
NUM_REPEATS = int(sys.argv[2])
if len(sys.argv) > 3:
NUM_SENTENCES = int(sys.argv[3])

model = runLinguist(datapath)
print('==========================================')
print('Welcome young adventurer, let me tell you a story! ')
while True:
STORY_START = raw_input('Enter story start (QUIT to go to work): ')
if(STORY_START == "QUIT"):
break
tellStory(model, STORY_START, NUM_SENTENCES)

print('Farewell')
else:
print "Usage: python linguist.py [path/to/data.txt]"
print "Usage: python linguist.py path/to/data.txt [num_repeats [num_sentences]]"
16 changes: 9 additions & 7 deletions client/model_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,14 @@
# threshold set below this default value.
# (This concept applies to both SP and TP and so 'cells'
# is correct here as opposed to 'columns')
'synPermConnected': 0.1,
'synPermConnected': 0.2,

'synPermActiveInc': 0.1,

'synPermInactiveDec': 0.025910586208889513,
'synPermInactiveDec': 0.1,

'randomSP': 0,
'spatialImp': 'cpp',
},

# Controls whether TP is enabled or disabled;
Expand All @@ -158,7 +159,7 @@
'columnCount': 2048,

# The number of cells (i.e., states), allocated per column.
'cellsPerColumn': 32,
'cellsPerColumn': 8,

'inputWidth': 2048,

Expand Down Expand Up @@ -227,7 +228,7 @@
# elements to append to the end of a learned sequence at a time.
# Smaller values are better for datasets with short sequences,
# higher values are better for datasets with long sequences.
'pamLength': 2,
'pamLength': 5,
},

'clParams': {
Expand All @@ -239,13 +240,14 @@

# This controls how fast the classifier learns/forgets. Higher values
# make it adapt faster and forget older patterns faster.
'alpha': 0.00080959960713530062,
'alpha': 0.01080959960713530062,

# This is set after the call to updateConfigFromSubConfig and is
# computed from the aggregationInfo and predictAheadTime.
'steps': '1,2,3,4,5,6,7,8,9,10',
#'steps': '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16',
'steps': '1,2,3,4,5',
},

'trainSPNetOnlyIfRequested': False,
'trainSPNetOnlyIfRequested': True,
},
}
15 changes: 15 additions & 0 deletions data/grammar/grammar_likes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
We like grammar.
We love grammar.
John loves grammar.
We run.
John runs.
We sing.
John sings.
We sit.
John sits.
We dig.
John digs.
We rock.
John rocks.
We spend time.
John spends time.

0 comments on commit 93a6c84

Please sign in to comment.