add comments, check for pure model in the constructor of ner and cate…

…gorizer, remove unnecessary temp variables
snowlord · Jan 31, 2017 · 32c1f71 · 32c1f71
1 parent 84cbbf4
commit 32c1f71
Show file tree

Hide file tree

Showing 11 changed files with 388 additions and 115 deletions.
diff --git a/examples/python/text_categorizer_multiple_models.py b/examples/python/text_categorizer_multiple_models.py
diff --git a/examples/python/text_categorizers_with_shared_feature_extractor.py b/examples/python/text_categorizers_with_shared_feature_extractor.py
@@ -0,0 +1,77 @@
+#!/usr/bin/python
+#
+#    This example shows that a feature extractor can be shared between two
+#    text categorizers
+#
+import sys
+import os
+
+# Make sure you put the mitielib folder into the python search path.  There are
+# many ways to do this; here we do it programmatically with the following
+# two statements:
+parent = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, parent + '/../../mitielib')
+
+from mitie import *
+
+def run_example():
+    # Train and save two pure-model categorizers, then classify text with one shared feature extractor.
+    fe_filename= "../../MITIE-models/english/total_word_feature_extractor.dat"
+    trainer1 = text_categorizer_trainer(fe_filename)
+
+    # Don't forget to add the training data.  Here we have only two examples, but for real
+    # use you need thousands.  You could also pass whole sentences to the tokenize() function
+    # to get the tokens.
+    trainer1.add_labeled_text(["I","am","so","happy","and","exciting","to","make","this"],"positive")
+    trainer1.add_labeled_text(["What","a","black","and","bad","day"],"negative")
+
+    # The trainer can take advantage of a multi-core CPU.  So set the number of threads
+    # equal to the number of processing cores for maximum training speed.
+    trainer1.num_threads = 4
+
+    # This function does the work of training.  Note that it can take a long time to run
+    # when using larger training datasets.  So be patient.
+    cat1 = trainer1.train()
+
+    # Now that training is done we can save the categorizer object to disk like so.
+    # With pure_model=True the saved file does not include a copy of the feature extractor.
+    cat1.save_to_disk("new_text_categorizer_1_pure_model.dat",pure_model=True)
+
+    # Now train another categorizer and save it as a pure model as well.
+    trainer2 = text_categorizer_trainer(fe_filename)
+    trainer2.add_labeled_text(tokenize("Recharge my phone"),"positive")
+    trainer2.add_labeled_text(tokenize("Recharge my number"),"positive")
+    trainer2.add_labeled_text(tokenize("I want to recharge my phone"),"positive")
+    trainer2.add_labeled_text(tokenize("Cancel"),"negative")
+    trainer2.add_labeled_text(tokenize("Finish"),"negative")
+    trainer2.add_labeled_text(tokenize("Close"),"negative")
+    trainer2.num_threads = 4
+    cat2 = trainer2.train()
+    cat2.save_to_disk("new_text_categorizer_2_pure_model.dat",pure_model=True)
+
+    # Load a single feature extractor which will be shared between the two categorizers.
+    text_feature_extractor = total_word_feature_extractor(fe_filename)
+
+    # Load the two pure-model categorizers saved above.
+    cat1_new = text_categorizer("new_text_categorizer_1_pure_model.dat")
+    cat2_new = text_categorizer("new_text_categorizer_2_pure_model.dat")
+
+    # Now use the feature extractor to categorize the text.
+    # Observe that we are passing the same feature extractor to both categorizers.
+
+    text1 = "I am so happy"
+    pred, conf = cat1_new(tokenize(text1), text_feature_extractor)
+    print ("predict sentiment of text '{0}' to be {1} with confidence {2}".format(text1, pred, conf))
+
+    text2 = "Can you recharge my phone?"
+    pred2, conf2 = cat2_new(tokenize(text2), text_feature_extractor)
+    print ("predict sentiment of text '{0}' to be {1} with confidence {2}".format(text2, pred2, conf2))
+
+    text3 = "Stop"
+    pred3, conf3 = cat2_new(tokenize(text3), text_feature_extractor)
+    print ("predict sentiment of text '{0}' to be {1} with confidence {2}".format(text3, pred3, conf3))
+
+
+if __name__ == '__main__':
+    run_example()  # run the demo only when executed as a script, not when imported
+
diff --git a/mitielib/include/mitie.h b/mitielib/include/mitie.h
@@ -151,12 +151,40 @@ extern "C"
             - The returned object MUST BE FREED by a call to mitie_free().
             - If the object can't be created then this function returns NULL.
     !*/
+
     typedef struct mitie_total_word_feature_extractor mitie_total_word_feature_extractor;
+
+    MITIE_EXPORT int mitie_check_ner_pure_model(
+        const char* filename
+    );
+    /*!
+        requires
+            - filename == a valid pointer to a NULL terminated C string
+        ensures
+            - Reads a saved MITIE named entity extractor from disk and checks whether
+              it is a pure model or not.
+            - filename must point to a serialized named_entity_extractor object
+              which was saved using mitie_save_named_entity_extractor_pure_model
+              or mitie_save_named_entity_extractor.
+            - returns 0 if it is a pure model and a non-zero value otherwise.
+    !*/
+
     MITIE_EXPORT mitie_named_entity_extractor* mitie_load_named_entity_extractor_pure_model_without_feature_extractor (
         const char* filename
     );
     /*!
-	Added for Rasa_nlu multitenancy
+        requires
+            - filename == a valid pointer to a NULL terminated C string
+        ensures
+            - Reads a saved MITIE named entity extractor from disk and returns a pointer to
+              the entity extractor object.
+            - filename must point to a serialized named_entity_extractor object
+              which was saved using mitie_save_named_entity_extractor_pure_model.
+            - Since the object contains just the model, subsequent calls to extract entities
+              must provide a valid total_word_feature_extractor object.  Specifically, use
+              mitie_extract_entities_with_extractor() instead of mitie_extract_entities().
+            - The returned object MUST BE FREED by a call to mitie_free().
+            - If the object can't be created then this function returns NULL.
     !*/
 
     MITIE_EXPORT unsigned long mitie_get_num_possible_ner_tags (
@@ -211,8 +239,22 @@ extern "C"
     MITIE_EXPORT mitie_named_entity_detections* mitie_extract_entities_with_extractor(
         const mitie_named_entity_extractor* ner,
         char** tokens,
-        const mitie_total_word_feature_extractor* fe_
+        const mitie_total_word_feature_extractor* fe
     );
+    /*!
+        requires
+            - ner != NULL
+            - tokens == An array of NULL terminated C strings.  The end of the array must
+              be indicated by a NULL value (i.e. exactly how mitie_tokenize() defines an
+              array of tokens).
+            - fe != NULL; fe points to a valid mitie_total_word_feature_extractor object.  This
+              must be the same feature extractor that was used when creating the ner model.
+        ensures
+            - Runs the supplied named entity extractor on the tokenized text and returns a
+              set of named entity detections.
+            - The returned object MUST BE FREED by a call to mitie_free().
+            - If the object can't be created then this function returns NULL.
+    !*/
 
     MITIE_EXPORT unsigned long mitie_ner_get_num_detections (
         const mitie_named_entity_detections* dets
@@ -450,11 +492,33 @@ extern "C"
             - If the object can't be created then this function returns NULL.
     !*/
 
-    MITIE_EXPORT mitie_text_categorizer* mitie_load_text_categorizer_pure_model_without_feature_extractor(
+    MITIE_EXPORT int mitie_check_text_categorizer_pure_model(
         const char* filename
     );
     /*!
-        Added for Rasa_nlu multitenancy
+        requires
+            - filename == a valid pointer to a NULL terminated C string
+        ensures
+            - Reads a saved MITIE text categorizer object from disk and checks whether it is
+              a pure model or not.
+            - filename must point to a serialized text_categorizer object
+              which was saved using mitie_save_text_categorizer_pure_model or mitie_save_text_categorizer.
+            - returns 0 if it is a pure model and a non-zero value otherwise.
+    !*/
+
+    MITIE_EXPORT mitie_text_categorizer* mitie_load_text_categorizer_pure_model_without_feature_extractor(
+        const char* filename
+    );
+    /*!
+        requires
+            - filename == a valid pointer to a NULL terminated C string
+        ensures
+            - Reads a saved MITIE text categorizer from disk and returns a pointer to it.
+            - filename must point to a text_categorizer saved by mitie_save_text_categorizer_pure_model.
+            - Since a pure model does not include a feature extractor, categorize with
+              mitie_categorize_text_with_extractor() and a valid feature extractor.
+            - The returned object MUST BE FREED by a call to mitie_free().
+            - If the object can't be created then this function returns NULL.
+    !*/
 
     MITIE_EXPORT int mitie_categorize_text (
@@ -472,25 +536,47 @@ extern "C"
             - text_tag != NULL  
             - text_score != NULL        
         ensures
-          - This function uses a trained text_categorizer to predict the category of a text,
-            represented by an array of tokens, where each token is one word. The category is
-            represented by its name (a string).  
-          - returns 0 upon success and a non-zero value on failure.  
-          - text_tag MUST BE FREED by a call to mitie_free().
-          - if (this function returns 0) then
-              - *text_tag == A NULL terminated C string containing the predicted category
-                to which this text belongs (selected from the set of categories tcat knows
-                about)
-              - *score == the confidence the categorizer has about its prediction.
+            - This function uses a trained text_categorizer to predict the category of a text,
+              represented by an array of tokens, where each token is one word. The category is
+              represented by its name (a string).
+            - returns 0 upon success and a non-zero value on failure.
+            - text_tag MUST BE FREED by a call to mitie_free().
+            - if (this function returns 0) then
+                - *text_tag == A NULL terminated C string containing the predicted category
+                  to which this text belongs (selected from the set of categories tcat knows
+                  about)
+                - *text_score == the confidence the categorizer has about its prediction.
     !*/
 
     MITIE_EXPORT int mitie_categorize_text_with_extractor (
         const mitie_text_categorizer* tcat,
         const char** tokens,
         char** text_tag,
         double* text_score,
-        const mitie_total_word_feature_extractor* fe_
+        const mitie_total_word_feature_extractor* fe
     );
+    /*!
+        requires
+            - tcat != NULL
+            - tokens == An array of NULL terminated C strings.  The end of the array must
+              be indicated by a NULL value (i.e. exactly how mitie_tokenize() defines an
+              array of tokens).
+            - text_tag != NULL
+            - text_score != NULL
+            - fe != NULL; this must be the same feature extractor that was used
+              when creating the text categorizer.
+        ensures
+            - This function uses a trained text_categorizer to predict the category of a text,
+              represented by an array of tokens, where each token is one word. The category is
+              represented by its name (a string).
+            - returns 0 upon success and a non-zero value on failure.
+            - text_tag MUST BE FREED by a call to mitie_free().
+            - if (this function returns 0) then
+                - *text_tag == A NULL terminated C string containing the predicted category
+                  to which this text belongs (selected from the set of categories tcat knows
+                  about)
+                - *text_score == the confidence the categorizer has about its prediction.
+    !*/
 
 // ----------------------------------------------------------------------------------------
 // ----------------------------------------------------------------------------------------