
Commit 5c6a6e9

Merge pull request RasaHQ#4743 from RasaHQ/fix_intent_featurizer

use only word level featurizer for intents

2 parents 6f920ba + cdac9cb

3 files changed: +123 -90 lines changed

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ Fixed
 - ``MultiProjectImporter`` now imports files in the order of the import statements
 - Fixed server hanging forever on leaving ``rasa shell`` before first message
 - Fixed rasa init showing traceback error when user does Keyboard Interrupt before choosing a project path
+- ``CountVectorsFeaturizer`` featurizes intents only if its analyzer is set to ``word``
 
 [1.4.2] - 2019-10-28
 ^^^^^^^^^^^^^^^^^^^^
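
In practice this means a pipeline that uses a char-level CountVectorsFeaturizer for text features needs a word-level instance alongside it if intents should be featurized as well. A minimal sketch of such a two-instance setup (the char_wb parameters are illustrative, not taken from this commit):

    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    # word-level instance: featurizes text and, after this fix, still intents
    word_featurizer = CountVectorsFeaturizer({"analyzer": "word"})

    # char-level instance: now contributes text features only
    char_featurizer = CountVectorsFeaturizer(
        {"analyzer": "char_wb", "min_ngram": 1, "max_ngram": 4}
    )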

rasa/nlu/featurizers/count_vectors_featurizer.py

Lines changed: 104 additions & 90 deletions
@@ -153,16 +153,6 @@ def _get_attribute_vocabulary(self, attribute: Text) -> Optional[Dict[Text, int]
         except (AttributeError, TypeError):
             return None
 
-    def _collect_vectorizer_vocabularies(self):
-        """Get vocabulary for all attributes"""
-
-        attribute_vocabularies = {}
-        for attribute in MESSAGE_ATTRIBUTES:
-            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
-                attribute
-            )
-        return attribute_vocabularies
-
     def _get_attribute_vocabulary_tokens(self, attribute: Text) -> Optional[List[Text]]:
         """Get all keys of vocabulary of an attribute"""

@@ -192,6 +182,15 @@ def _check_analyzer(self):
                 "contain single letters only."
             )
 
+    @staticmethod
+    def _attributes(analyzer):
+        """Create a list of attributes that should be featurized."""
+
+        # intents should be featurized only by word level count vectorizer
+        return (
+            MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES
+        )
+
     def __init__(
         self,
         component_config: Dict[Text, Any] = None,
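
The new _attributes helper is the heart of the fix. A standalone sketch of its selection logic, assuming the Rasa 1.4 constants (MESSAGE_ATTRIBUTES covering text, intent and response; SPACY_FEATURIZABLE_ATTRIBUTES covering only text and response):

    # assumed values of the constants from rasa.nlu.constants
    MESSAGE_ATTRIBUTES = ["text", "intent", "response"]
    SPACY_FEATURIZABLE_ATTRIBUTES = ["text", "response"]

    def attributes_to_featurize(analyzer):
        # char-level analyzers skip the intent attribute entirely
        return MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES

    assert attributes_to_featurize("word") == ["text", "intent", "response"]
    assert attributes_to_featurize("char") == ["text", "response"]
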
@@ -210,6 +209,9 @@ def __init__(
         # warn that some of config parameters might be ignored
         self._check_analyzer()
 
+        # set which attributes to featurize
+        self._attributes = self._attributes(self.analyzer)
+
         # declare class instance for CountVectorizer
         self.vectorizers = vectorizers
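
Note the deliberate name reuse: once __init__ has run, the instance attribute _attributes (a list) shadows the _attributes static method on the instance, while the method stays reachable on the class; that is why the factory methods further down call cls._attributes(analyzer). A toy sketch of the mechanics (not Rasa code):

    class Component:
        @staticmethod
        def _attributes(analyzer):
            return ["text", "intent"] if analyzer == "word" else ["text"]

        def __init__(self, analyzer):
            # the right-hand side still resolves to the static method;
            # the assignment then shadows it with the computed list
            self._attributes = self._attributes(analyzer)

    c = Component("char")
    assert c._attributes == ["text"]  # instance: the computed list
    assert Component._attributes("word") == ["text", "intent"]  # class: the method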

@@ -335,7 +337,7 @@ def _get_all_attributes_processed_texts(
         """Get processed text for all attributes of examples in training data"""
 
         processed_attribute_texts = {}
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             attribute_texts = [
                 self._get_message_text_by_attribute(example, attribute)
                 for example in training_data.intent_examples
@@ -344,82 +346,10 @@ def _get_all_attributes_processed_texts(
             processed_attribute_texts[attribute] = attribute_texts
         return processed_attribute_texts
 
-    @staticmethod
-    def create_shared_vocab_vectorizers(
-        token_pattern,
-        strip_accents,
-        lowercase,
-        stop_words,
-        ngram_range,
-        max_df,
-        min_df,
-        max_features,
-        analyzer,
-        vocabulary=None,
-    ) -> Dict[Text, "CountVectorizer"]:
-        """Create vectorizers for all attributes with shared vocabulary"""
-
-        shared_vectorizer = CountVectorizer(
-            token_pattern=token_pattern,
-            strip_accents=strip_accents,
-            lowercase=lowercase,
-            stop_words=stop_words,
-            ngram_range=ngram_range,
-            max_df=max_df,
-            min_df=min_df,
-            max_features=max_features,
-            analyzer=analyzer,
-            vocabulary=vocabulary,
-        )
-
-        attribute_vectorizers = {}
-
-        for attribute in MESSAGE_ATTRIBUTES:
-            attribute_vectorizers[attribute] = shared_vectorizer
-
-        return attribute_vectorizers
-
-    @staticmethod
-    def create_independent_vocab_vectorizers(
-        token_pattern,
-        strip_accents,
-        lowercase,
-        stop_words,
-        ngram_range,
-        max_df,
-        min_df,
-        max_features,
-        analyzer,
-        vocabulary=None,
-    ) -> Dict[Text, "CountVectorizer"]:
-        """Create vectorizers for all attributes with independent vocabulary"""
-
-        attribute_vectorizers = {}
-
-        for attribute in MESSAGE_ATTRIBUTES:
-
-            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
-
-            attribute_vectorizer = CountVectorizer(
-                token_pattern=token_pattern,
-                strip_accents=strip_accents,
-                lowercase=lowercase,
-                stop_words=stop_words,
-                ngram_range=ngram_range,
-                max_df=max_df,
-                min_df=min_df,
-                max_features=max_features,
-                analyzer=analyzer,
-                vocabulary=attribute_vocabulary,
-            )
-            attribute_vectorizers[attribute] = attribute_vectorizer
-
-        return attribute_vectorizers
-
     def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         """Construct the vectorizers and train them with a shared vocab"""
 
-        self.vectorizers = self.create_shared_vocab_vectorizers(
+        self.vectorizers = self._create_shared_vocab_vectorizers(
             self.token_pattern,
             self.strip_accents,
             self.lowercase,
@@ -432,7 +362,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         )
 
         combined_cleaned_texts = []
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             combined_cleaned_texts += attribute_texts[attribute]
 
         try:
@@ -449,7 +379,7 @@ def _attribute_texts_is_non_empty(attribute_texts):
     def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         """Construct the vectorizers and train them with an independent vocab"""
 
-        self.vectorizers = self.create_independent_vocab_vectorizers(
+        self.vectorizers = self._create_independent_vocab_vectorizers(
             self.token_pattern,
             self.strip_accents,
             self.lowercase,
@@ -461,7 +391,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]])
             self.analyzer,
         )
 
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             if self._attribute_texts_is_non_empty(attribute_texts[attribute]):
                 try:
                     self.vectorizers[attribute].fit(attribute_texts[attribute])
@@ -516,7 +446,7 @@ def train(
             self._train_with_independent_vocab(processed_attribute_texts)
 
         # transform for all attributes
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
 
             attribute_features = self._get_featurized_attribute(
                 attribute, processed_attribute_texts[attribute]
@@ -556,6 +486,16 @@ def process(self, message: Message, **kwargs: Any) -> None:
             ),
         )
 
+    def _collect_vectorizer_vocabularies(self):
+        """Get vocabulary for all attributes"""
+
+        attribute_vocabularies = {}
+        for attribute in self._attributes:
+            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
+                attribute
+            )
+        return attribute_vocabularies
+
     @staticmethod
     def _is_any_model_trained(attribute_vocabularies) -> bool:
         """Check if any model got trained"""
@@ -586,6 +526,80 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
             utils.json_pickle(featurizer_file, attribute_vocabularies)
         return {"file": file_name}
 
+    @classmethod
+    def _create_shared_vocab_vectorizers(
+        cls,
+        token_pattern,
+        strip_accents,
+        lowercase,
+        stop_words,
+        ngram_range,
+        max_df,
+        min_df,
+        max_features,
+        analyzer,
+        vocabulary=None,
+    ) -> Dict[Text, "CountVectorizer"]:
+        """Create vectorizers for all attributes with shared vocabulary"""
+
+        shared_vectorizer = CountVectorizer(
+            token_pattern=token_pattern,
+            strip_accents=strip_accents,
+            lowercase=lowercase,
+            stop_words=stop_words,
+            ngram_range=ngram_range,
+            max_df=max_df,
+            min_df=min_df,
+            max_features=max_features,
+            analyzer=analyzer,
+            vocabulary=vocabulary,
+        )
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes(analyzer):
+            attribute_vectorizers[attribute] = shared_vectorizer
+
+        return attribute_vectorizers
+
+    @classmethod
+    def _create_independent_vocab_vectorizers(
+        cls,
+        token_pattern,
+        strip_accents,
+        lowercase,
+        stop_words,
+        ngram_range,
+        max_df,
+        min_df,
+        max_features,
+        analyzer,
+        vocabulary=None,
+    ) -> Dict[Text, "CountVectorizer"]:
+        """Create vectorizers for all attributes with independent vocabulary"""
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes(analyzer):
+
+            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
+
+            attribute_vectorizer = CountVectorizer(
+                token_pattern=token_pattern,
+                strip_accents=strip_accents,
+                lowercase=lowercase,
+                stop_words=stop_words,
+                ngram_range=ngram_range,
+                max_df=max_df,
+                min_df=min_df,
+                max_features=max_features,
+                analyzer=analyzer,
+                vocabulary=attribute_vocabulary,
+            )
+            attribute_vectorizers[attribute] = attribute_vectorizer
+
+        return attribute_vectorizers
+
     @classmethod
     def load(
         cls,
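
The two factories, now private classmethods, differ only in how vocabulary is shared across attributes. A rough sketch of that distinction using scikit-learn directly (attribute names and texts are illustrative):

    from sklearn.feature_extraction.text import CountVectorizer

    texts = {"text": ["hello there"], "response": ["general kenobi"]}

    # shared vocabulary: one CountVectorizer instance, fit once on the
    # combined texts of all attributes, then mapped to every attribute
    shared = CountVectorizer(analyzer="word")
    shared.fit(texts["text"] + texts["response"])
    shared_vectorizers = {attribute: shared for attribute in texts}

    # independent vocabularies: a separate CountVectorizer per attribute
    independent_vectorizers = {
        attribute: CountVectorizer(analyzer="word").fit(attribute_texts)
        for attribute, attribute_texts in texts.items()
    }

Because both factories consult cls._attributes(analyzer), a char-level featurizer never allocates a vectorizer for the intent attribute in the first place.
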
@@ -605,7 +619,7 @@ def load(
         share_vocabulary = meta["use_shared_vocab"]
 
         if share_vocabulary:
-            vectorizers = cls.create_shared_vocab_vectorizers(
+            vectorizers = cls._create_shared_vocab_vectorizers(
                 token_pattern=meta["token_pattern"],
                 strip_accents=meta["strip_accents"],
                 lowercase=meta["lowercase"],
@@ -618,7 +632,7 @@
                 vocabulary=vocabulary,
             )
         else:
-            vectorizers = cls.create_independent_vocab_vectorizers(
+            vectorizers = cls._create_independent_vocab_vectorizers(
                 token_pattern=meta["token_pattern"],
                 strip_accents=meta["strip_accents"],
                 lowercase=meta["lowercase"],

tests/nlu/base/test_featurizers.py

Lines changed: 18 additions & 0 deletions
@@ -461,6 +461,24 @@ def test_count_vector_featurizer_char(sentence, expected):
     assert np.all(test_message.get("text_features") == expected)
 
 
+def test_count_vector_featurizer_char_intent_featurizer():
+    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer
+
+    ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"})
+    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
+    ftr.train(td, config=None)
+
+    intent_features_exist = np.array(
+        [
+            True if example.get("intent_features") is not None else False
+            for example in td.intent_examples
+        ]
+    )
+
+    # no intent features should have been set
+    assert not any(intent_features_exist)
+
+
 def test_count_vector_featurizer_persist_load(tmpdir):
     from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer
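
The test pins down the user-visible behaviour. The same check can be run as a standalone script, assuming a Rasa 1.4 checkout (so the demo data path resolves) and the from rasa.nlu import training_data import that the test module presumably uses:

    from rasa.nlu import training_data
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"})
    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    ftr.train(td, config=None)

    # the char-level featurizer must leave every intent unfeaturized
    assert all(ex.get("intent_features") is None for ex in td.intent_examples)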
