
Commit 2a4ed65

use only word level featurizer for intents. add test

1 parent 6f920ba
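
In short: the commit makes CountVectorsFeaturizer skip the intent attribute whenever a non-word analyzer (e.g. "char") is configured, on the stated rationale that intents should only be featurized by a word-level count vectorizer. A minimal sketch of the selection rule, where the constant values below are illustrative placeholders rather than the library's actual lists:

    # Hypothetical stand-ins for the rasa.nlu.constants values (illustration only)
    MESSAGE_ATTRIBUTES = ["text", "intent", "response"]
    SPACY_FEATURIZABLE_ATTRIBUTES = ["text", "response"]

    def attributes_to_featurize(analyzer: str) -> list:
        # intents should be featurized only by a word-level count vectorizer
        return MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES

    assert "intent" in attributes_to_featurize("word")
    assert "intent" not in attributes_to_featurize("char")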

3 files changed, +122 −90 lines

rasa/nlu/classifiers/embedding_intent_classifier.py
Lines changed: 2 additions & 0 deletions

@@ -517,6 +517,8 @@ def train(
 
         session_data = self.preprocess_train_data(training_data)
 
+        print(session_data.Y.shape)
+        exit()
         possible_to_train = self._check_enough_labels(session_data)
 
         if not possible_to_train:

rasa/nlu/featurizers/count_vectors_featurizer.py
Lines changed: 102 additions & 90 deletions
@@ -153,16 +153,6 @@ def _get_attribute_vocabulary(self, attribute: Text) -> Optional[Dict[Text, int]]:
         except (AttributeError, TypeError):
             return None
 
-    def _collect_vectorizer_vocabularies(self):
-        """Get vocabulary for all attributes"""
-
-        attribute_vocabularies = {}
-        for attribute in MESSAGE_ATTRIBUTES:
-            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
-                attribute
-            )
-        return attribute_vocabularies
-
     def _get_attribute_vocabulary_tokens(self, attribute: Text) -> Optional[List[Text]]:
         """Get all keys of vocabulary of an attribute"""

@@ -192,6 +182,13 @@ def _check_analyzer(self):
                 "contain single letters only."
             )
 
+    @staticmethod
+    def _attributes(analyzer):
+        """Create a list of attributes that should be featurized."""
+
+        # intents should be featurized only by word level count vectorizer
+        return MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES
+
     def __init__(
         self,
         component_config: Dict[Text, Any] = None,
@@ -210,6 +207,9 @@ def __init__(
         # warn that some of config parameters might be ignored
         self._check_analyzer()
 
+        # set which attributes to featurize
+        self._attributes = self._attributes(self.analyzer)
+
         # declare class instance for CountVectorizer
         self.vectorizers = vectorizers
@@ -335,7 +335,7 @@ def _get_all_attributes_processed_texts(
         """Get processed text for all attributes of examples in training data"""
 
         processed_attribute_texts = {}
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             attribute_texts = [
                 self._get_message_text_by_attribute(example, attribute)
                 for example in training_data.intent_examples
@@ -344,82 +344,10 @@ def _get_all_attributes_processed_texts(
             processed_attribute_texts[attribute] = attribute_texts
         return processed_attribute_texts
 
-    @staticmethod
-    def create_shared_vocab_vectorizers(
-        token_pattern,
-        strip_accents,
-        lowercase,
-        stop_words,
-        ngram_range,
-        max_df,
-        min_df,
-        max_features,
-        analyzer,
-        vocabulary=None,
-    ) -> Dict[Text, "CountVectorizer"]:
-        """Create vectorizers for all attributes with shared vocabulary"""
-
-        shared_vectorizer = CountVectorizer(
-            token_pattern=token_pattern,
-            strip_accents=strip_accents,
-            lowercase=lowercase,
-            stop_words=stop_words,
-            ngram_range=ngram_range,
-            max_df=max_df,
-            min_df=min_df,
-            max_features=max_features,
-            analyzer=analyzer,
-            vocabulary=vocabulary,
-        )
-
-        attribute_vectorizers = {}
-
-        for attribute in MESSAGE_ATTRIBUTES:
-            attribute_vectorizers[attribute] = shared_vectorizer
-
-        return attribute_vectorizers
-
-    @staticmethod
-    def create_independent_vocab_vectorizers(
-        token_pattern,
-        strip_accents,
-        lowercase,
-        stop_words,
-        ngram_range,
-        max_df,
-        min_df,
-        max_features,
-        analyzer,
-        vocabulary=None,
-    ) -> Dict[Text, "CountVectorizer"]:
-        """Create vectorizers for all attributes with independent vocabulary"""
-
-        attribute_vectorizers = {}
-
-        for attribute in MESSAGE_ATTRIBUTES:
-
-            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
-
-            attribute_vectorizer = CountVectorizer(
-                token_pattern=token_pattern,
-                strip_accents=strip_accents,
-                lowercase=lowercase,
-                stop_words=stop_words,
-                ngram_range=ngram_range,
-                max_df=max_df,
-                min_df=min_df,
-                max_features=max_features,
-                analyzer=analyzer,
-                vocabulary=attribute_vocabulary,
-            )
-            attribute_vectorizers[attribute] = attribute_vectorizer
-
-        return attribute_vectorizers
-
     def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         """Construct the vectorizers and train them with a shared vocab"""
 
-        self.vectorizers = self.create_shared_vocab_vectorizers(
+        self.vectorizers = self._create_shared_vocab_vectorizers(
             self.token_pattern,
             self.strip_accents,
             self.lowercase,
@@ -432,7 +360,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         )
 
         combined_cleaned_texts = []
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             combined_cleaned_texts += attribute_texts[attribute]
 
         try:
@@ -449,7 +377,7 @@ def _attribute_texts_is_non_empty(attribute_texts):
     def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         """Construct the vectorizers and train them with an independent vocab"""
 
-        self.vectorizers = self.create_independent_vocab_vectorizers(
+        self.vectorizers = self._create_independent_vocab_vectorizers(
            self.token_pattern,
            self.strip_accents,
            self.lowercase,
@@ -461,7 +389,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]):
             self.analyzer,
         )
 
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             if self._attribute_texts_is_non_empty(attribute_texts[attribute]):
                 try:
                     self.vectorizers[attribute].fit(attribute_texts[attribute])
@@ -516,7 +444,7 @@ def train(
             self._train_with_independent_vocab(processed_attribute_texts)
 
         # transform for all attributes
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
 
             attribute_features = self._get_featurized_attribute(
                 attribute, processed_attribute_texts[attribute]
@@ -556,6 +484,16 @@ def process(self, message: Message, **kwargs: Any) -> None:
             ),
         )
 
+    def _collect_vectorizer_vocabularies(self):
+        """Get vocabulary for all attributes"""
+
+        attribute_vocabularies = {}
+        for attribute in self._attributes:
+            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
+                attribute
+            )
+        return attribute_vocabularies
+
     @staticmethod
     def _is_any_model_trained(attribute_vocabularies) -> bool:
         """Check if any model got trained"""
@@ -586,6 +524,80 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
         utils.json_pickle(featurizer_file, attribute_vocabularies)
         return {"file": file_name}
 
+    @classmethod
+    def _create_shared_vocab_vectorizers(
+        cls,
+        token_pattern,
+        strip_accents,
+        lowercase,
+        stop_words,
+        ngram_range,
+        max_df,
+        min_df,
+        max_features,
+        analyzer,
+        vocabulary=None,
+    ) -> Dict[Text, "CountVectorizer"]:
+        """Create vectorizers for all attributes with shared vocabulary"""
+
+        shared_vectorizer = CountVectorizer(
+            token_pattern=token_pattern,
+            strip_accents=strip_accents,
+            lowercase=lowercase,
+            stop_words=stop_words,
+            ngram_range=ngram_range,
+            max_df=max_df,
+            min_df=min_df,
+            max_features=max_features,
+            analyzer=analyzer,
+            vocabulary=vocabulary,
+        )
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes(analyzer):
+            attribute_vectorizers[attribute] = shared_vectorizer
+
+        return attribute_vectorizers
+
+    @classmethod
+    def _create_independent_vocab_vectorizers(
+        cls,
+        token_pattern,
+        strip_accents,
+        lowercase,
+        stop_words,
+        ngram_range,
+        max_df,
+        min_df,
+        max_features,
+        analyzer,
+        vocabulary=None,
+    ) -> Dict[Text, "CountVectorizer"]:
+        """Create vectorizers for all attributes with independent vocabulary"""
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes(analyzer):
+
+            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
+
+            attribute_vectorizer = CountVectorizer(
+                token_pattern=token_pattern,
+                strip_accents=strip_accents,
+                lowercase=lowercase,
+                stop_words=stop_words,
+                ngram_range=ngram_range,
+                max_df=max_df,
+                min_df=min_df,
+                max_features=max_features,
+                analyzer=analyzer,
+                vocabulary=attribute_vocabulary,
+            )
+            attribute_vectorizers[attribute] = attribute_vectorizer
+
+        return attribute_vectorizers
+
     @classmethod
     def load(
         cls,
@@ -605,7 +617,7 @@ def load(
         share_vocabulary = meta["use_shared_vocab"]
 
         if share_vocabulary:
-            vectorizers = cls.create_shared_vocab_vectorizers(
+            vectorizers = cls._create_shared_vocab_vectorizers(
                 token_pattern=meta["token_pattern"],
                 strip_accents=meta["strip_accents"],
                 lowercase=meta["lowercase"],
@@ -618,7 +630,7 @@ def load(
                 vocabulary=vocabulary,
             )
         else:
-            vectorizers = cls.create_independent_vocab_vectorizers(
+            vectorizers = cls._create_independent_vocab_vectorizers(
                 token_pattern=meta["token_pattern"],
                 strip_accents=meta["strip_accents"],
                 lowercase=meta["lowercase"],

tests/nlu/base/test_featurizers.py
Lines changed: 18 additions & 0 deletions

@@ -461,6 +461,24 @@ def test_count_vector_featurizer_char(sentence, expected):
     assert np.all(test_message.get("text_features") == expected)
 
 
+def test_count_vector_featurizer_char_intent_featurizer():
+    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer
+
+    ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"})
+    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
+    ftr.train(td, config=None)
+
+    intent_features_exist = np.array(
+        [
+            True if example.get("intent_features") is not None else False
+            for example in td.intent_examples
+        ]
+    )
+
+    # no intent features should have been set
+    assert not any(intent_features_exist)
+
+
 def test_count_vector_featurizer_persist_load(tmpdir):
     from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer
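
For contrast, a sketch (not part of the commit) of the complementary word-level case, assuming the same demo training data as the test above and that "word" is the featurizer's default analyzer: the intent attribute then stays in the featurized set, so intent features should be present.

    # Sketch only, not from the commit: with a word-level analyzer,
    # intents remain among the featurized attributes.
    from rasa.nlu import training_data
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"analyzer": "word"})
    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    ftr.train(td, config=None)

    assert any(
        example.get("intent_features") is not None for example in td.intent_examples
    )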
