Skip to content

Commit

Permalink
add comments, check for pure model in the constructor of ner and cate…
Browse files Browse the repository at this point in the history
…gorizer, remove unnecessary temp variables
  • Loading branch information
vinvinod committed Jan 31, 2017
1 parent 84cbbf4 commit 32c1f71
Show file tree
Hide file tree
Showing 11 changed files with 388 additions and 115 deletions.
66 changes: 0 additions & 66 deletions examples/python/text_categorizer_multiple_models.py

This file was deleted.

77 changes: 77 additions & 0 deletions examples/python/text_categorizers_with_shared_feature_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/python
#
# This example shows that a feature extractor can be shared between two
# text categorizers
#
import sys
import os

# The mitielib folder must be on the Python search path so that the
# `from mitie import *` below can succeed. There are many ways to arrange
# that; here we do it programmatically: resolve the directory that contains
# this script (following symlinks) and prepend its ../../mitielib neighbour
# to sys.path.
parent = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, parent + '/../../mitielib')

from mitie import *

def run_example():
    """Train two text categorizers, save them as pure models, and query both
    through a single shared feature extractor.

    The feature extractor path and the two model files are relative paths, so
    they are resolved against the current working directory.
    """
    extractor_path = "../../MITIE-models/english/total_word_feature_extractor.dat"
    first_model_path = "new_text_categorizer_1_pure_model.dat"
    second_model_path = "new_text_categorizer_2_pure_model.dat"
    report_template = "predict sentiment of text '{0}' to be {1} with confidence {2}"

    # --- First categorizer -------------------------------------------------
    # Only two training examples are used here; real applications need
    # thousands. These examples are already split into tokens, but whole
    # sentences could be passed through tokenize() instead.
    first_trainer = text_categorizer_trainer(extractor_path)
    pre_tokenized_samples = (
        (["I", "am", "so", "happy", "and", "exciting", "to", "make", "this"], "positive"),
        (["What", "a", "black", "and", "bad", "day"], "negative"),
    )
    for tokens, label in pre_tokenized_samples:
        first_trainer.add_labeled_text(tokens, label)

    # The trainer can use several cores; set this to the number of processing
    # cores for maximum training speed.
    first_trainer.num_threads = 4

    # Training does the real work and can take a long time on larger datasets.
    first_categorizer = first_trainer.train()

    # Save in pure_model mode, i.e. without a copy of the feature extractor.
    first_categorizer.save_to_disk(first_model_path, pure_model=True)

    # --- Second categorizer ------------------------------------------------
    # Trained from raw sentences that are tokenized on the fly, then saved as
    # a pure model as well.
    second_trainer = text_categorizer_trainer(extractor_path)
    raw_samples = (
        ("Recharge my phone", "positive"),
        ("Recharge my number", "positive"),
        ("I want to recharge my phone", "positive"),
        ("Cancel", "negative"),
        ("Finish", "negative"),
        ("Close", "negative"),
    )
    for sentence, label in raw_samples:
        second_trainer.add_labeled_text(tokenize(sentence), label)
    second_trainer.num_threads = 4
    second_categorizer = second_trainer.train()
    second_categorizer.save_to_disk(second_model_path, pure_model=True)

    # --- Shared feature extractor ------------------------------------------
    # Load one feature extractor and the two pure-model categorizers.
    shared_extractor = total_word_feature_extractor(extractor_path)
    reloaded_first = text_categorizer(first_model_path)
    reloaded_second = text_categorizer(second_model_path)

    # Every query below passes the very same feature extractor object, no
    # matter which categorizer answers it.
    queries = (
        (reloaded_first, "I am so happy"),
        (reloaded_second, "Can you recharge my phone?"),
        (reloaded_second, "Stop"),
    )
    for categorizer, text in queries:
        label, confidence = categorizer(tokenize(text), shared_extractor)
        print(report_template.format(text, label, confidence))


# Run the demonstration only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    run_example()

116 changes: 101 additions & 15 deletions mitielib/include/mitie.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,40 @@ extern "C"
- The returned object MUST BE FREED by a call to mitie_free().
- If the object can't be created then this function returns NULL.
!*/

typedef struct mitie_total_word_feature_extractor mitie_total_word_feature_extractor;

MITIE_EXPORT int mitie_check_ner_pure_model(
const char* filename
);
/*!
requires
- filename == a valid pointer to a NULL terminated C string
ensures
- Reads a saved MITIE named entity extractor (ner) object from disk and
checks whether it is a pure model or not.
- filename must point to a serialized ner object
which was saved using mitie_save_named_entity_extractor_pure_model
or mitie_save_named_entity_extractor.
- returns 0 if it is a pure model and a non-zero value otherwise. Note that
0 is the "yes, it is a pure model" answer, so the result must not be
tested as a plain boolean.
!*/

MITIE_EXPORT mitie_named_entity_extractor* mitie_load_named_entity_extractor_pure_model_without_feature_extractor (
const char* filename
);
/*!
Added for Rasa_nlu multitenancy.
requires
- filename == a valid pointer to a NULL terminated C string
ensures
- Reads a saved MITIE named entity extractor from disk and returns a pointer to
the entity extractor object.
- filename must point to a serialized named_entity_extractor object
which was saved using mitie_save_named_entity_extractor_pure_model.
- The loaded object contains just the model and no feature extractor, so
subsequent calls to extract entities must provide a valid
total_word_feature_extractor object. Specifically, use
mitie_extract_entities_with_extractor() instead of mitie_extract_entities().
- The returned object MUST BE FREED by a call to mitie_free().
- If the object can't be created then this function returns NULL.
!*/

MITIE_EXPORT unsigned long mitie_get_num_possible_ner_tags (
Expand Down Expand Up @@ -211,8 +239,22 @@ extern "C"
MITIE_EXPORT mitie_named_entity_detections* mitie_extract_entities_with_extractor(
const mitie_named_entity_extractor* ner,
char** tokens,
const mitie_total_word_feature_extractor* fe_
const mitie_total_word_feature_extractor* fe
);
/*!
requires
- ner != NULL
- tokens == An array of NULL terminated C strings. The end of the array must
be indicated by a NULL value (i.e. exactly how mitie_tokenize() defines an
array of tokens).
- fe != NULL; Pointer to a valid mitie_total_word_feature_extractor object. This must
be the same feature extractor which was used when creating the ner model.
ensures
- The returned object MUST BE FREED by a call to mitie_free().
- Runs the supplied named entity extractor on the tokenized text and returns a
set of named entity detections.
- If the object can't be created then this function returns NULL
!*/

MITIE_EXPORT unsigned long mitie_ner_get_num_detections (
const mitie_named_entity_detections* dets
Expand Down Expand Up @@ -450,11 +492,33 @@ extern "C"
- If the object can't be created then this function returns NULL.
!*/

MITIE_EXPORT mitie_text_categorizer* mitie_load_text_categorizer_pure_model_without_feature_extractor(
MITIE_EXPORT int mitie_check_text_categorizer_pure_model(
const char* filename
);
/*!
Added for Rasa_nlu multitenancy.
requires
- filename == a valid pointer to a NULL terminated C string
ensures
- Reads a saved MITIE text categorizer object from disk and checks whether it is
a pure model or not.
- filename must point to a serialized text_categorizer object
which was saved using mitie_save_text_categorizer_pure_model or mitie_save_text_categorizer.
- returns 0 if it is a pure model and a non-zero value otherwise. Note that
0 is the "yes, it is a pure model" answer, so the result must not be
tested as a plain boolean.
!*/

mitie_text_categorizer* mitie_load_text_categorizer_pure_model_without_feature_extractor(
const char* filename
);
/*!
requires
- filename == a valid pointer to a NULL terminated C string
ensures
- Reads a saved MITIE text categorizer from disk and returns a pointer to
the categorizer object.
- filename must point to a serialized text_categorizer object
which was saved using mitie_save_text_categorizer_pure_model.
- The returned object MUST BE FREED by a call to mitie_free().
- If the object can't be created then this function returns NULL.
!*/

MITIE_EXPORT int mitie_categorize_text (
Expand All @@ -472,25 +536,47 @@ extern "C"
- text_tag != NULL
- text_score != NULL
ensures
- This function uses a trained text_categorizer to predict the category of a text,
represented by an array of tokens, where each token is one word. The category is
represented by its name (a string).
- returns 0 upon success and a non-zero value on failure.
- text_tag MUST BE FREED by a call to mitie_free().
- if (this function returns 0) then
- *text_tag == A NULL terminated C string containing the predicted category
to which this text belongs (selected from the set of categories tcat knows
about)
- *text_score == the confidence the categorizer has about its prediction.
- This function uses a trained text_categorizer to predict the category of a text,
represented by an array of tokens, where each token is one word. The category is
represented by its name (a string).
- returns 0 upon success and a non-zero value on failure.
- text_tag MUST BE FREED by a call to mitie_free().
- if (this function returns 0) then
- *text_tag == A NULL terminated C string containing the predicted category
to which this text belongs (selected from the set of categories tcat knows
about)
- *text_score == the confidence the categorizer has about its prediction.
!*/

MITIE_EXPORT int mitie_categorize_text_with_extractor (
const mitie_text_categorizer* tcat,
const char** tokens,
char** text_tag,
double* text_score,
const mitie_total_word_feature_extractor* fe_
const mitie_total_word_feature_extractor* fe
);
/*!
requires
- tcat != NULL
- tokens == An array of NULL terminated C strings. The end of the array must
be indicated by a NULL value (i.e. exactly how mitie_tokenize() defines an
array of tokens).
- text_tag != NULL
- text_score != NULL
- fe != NULL. This feature extractor must be the same as the one that was used
during creation of the text categorizer.
ensures
- This function uses a trained text_categorizer to predict the category of a text,
represented by an array of tokens, where each token is one word. The category is
represented by its name (a string).
- returns 0 upon success and a non-zero value on failure.
- text_tag MUST BE FREED by a call to mitie_free().
- if (this function returns 0) then
- *text_tag == A NULL terminated C string containing the predicted category
to which this text belongs (selected from the set of categories tcat knows
about)
- *text_score == the confidence the categorizer has about its prediction.
!*/

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 32c1f71

Please sign in to comment.