@@ -153,16 +153,6 @@ def _get_attribute_vocabulary(self, attribute: Text) -> Optional[Dict[Text, int]
         except (AttributeError, TypeError):
             return None
 
-    def _collect_vectorizer_vocabularies(self):
-        """Get vocabulary for all attributes"""
-
-        attribute_vocabularies = {}
-        for attribute in MESSAGE_ATTRIBUTES:
-            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
-                attribute
-            )
-        return attribute_vocabularies
-
     def _get_attribute_vocabulary_tokens(self, attribute: Text) -> Optional[List[Text]]:
         """Get all keys of vocabulary of an attribute"""
 
@@ -192,6 +182,15 @@ def _check_analyzer(self):
                 "contain single letters only."
             )
 
+    @staticmethod
+    def _attributes(analyzer):
+        """Create a list of attributes that should be featurized."""
+
+        # intents should be featurized only by word level count vectorizer
+        return (
+            MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES
+        )
+
     def __init__(
         self,
         component_config: Dict[Text, Any] = None,
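
Aside (not part of the diff): a minimal, self-contained sketch of what the new `_attributes` helper selects. The two constant lists below are illustrative stand-ins, not the real values of Rasa's `MESSAGE_ATTRIBUTES` and `SPACY_FEATURIZABLE_ATTRIBUTES`.

    # Illustrative stand-ins for the Rasa constants, not their real values.
    MESSAGE_ATTRIBUTES = ["text", "intent", "response"]
    SPACY_FEATURIZABLE_ATTRIBUTES = ["text", "response"]

    def _attributes(analyzer):
        # intents only make sense at the word level, so char analyzers skip them
        return MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES

    assert _attributes("word") == ["text", "intent", "response"]
    assert _attributes("char_wb") == ["text", "response"]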
@@ -210,6 +209,9 @@ def __init__(
         # warn that some of config parameters might be ignored
         self._check_analyzer()
 
+        # set which attributes to featurize
+        self._attributes = self._attributes(self.analyzer)
+
         # declare class instance for CountVectorizer
         self.vectorizers = vectorizers
 
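
Note the deliberate shadowing here: after `__init__` runs, `self._attributes` is the computed list on the instance, while the class still holds the callable, which is why the `_create_*_vocab_vectorizers` classmethods added later in this diff call `cls._attributes(analyzer)`. A standalone sketch of that behavior (the `Demo` class is hypothetical, not PR code):

    class Demo:
        @staticmethod
        def _attributes(analyzer):
            return ["text", "intent"] if analyzer == "word" else ["text"]

        def __init__(self, analyzer):
            # instance attribute now shadows the staticmethod for this instance
            self._attributes = self._attributes(analyzer)

    d = Demo("word")
    assert d._attributes == ["text", "intent"]   # instance: the computed list
    assert Demo._attributes("char") == ["text"]  # class: still the callable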
@@ -335,7 +337,7 @@ def _get_all_attributes_processed_texts(
         """Get processed text for all attributes of examples in training data"""
 
         processed_attribute_texts = {}
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             attribute_texts = [
                 self._get_message_text_by_attribute(example, attribute)
                 for example in training_data.intent_examples
@@ -344,82 +346,10 @@ def _get_all_attributes_processed_texts(
             processed_attribute_texts[attribute] = attribute_texts
         return processed_attribute_texts
 
-    @staticmethod
-    def create_shared_vocab_vectorizers(
-        token_pattern,
-        strip_accents,
-        lowercase,
-        stop_words,
-        ngram_range,
-        max_df,
-        min_df,
-        max_features,
-        analyzer,
-        vocabulary=None,
-    ) -> Dict[Text, "CountVectorizer"]:
-        """Create vectorizers for all attributes with shared vocabulary"""
-
-        shared_vectorizer = CountVectorizer(
-            token_pattern=token_pattern,
-            strip_accents=strip_accents,
-            lowercase=lowercase,
-            stop_words=stop_words,
-            ngram_range=ngram_range,
-            max_df=max_df,
-            min_df=min_df,
-            max_features=max_features,
-            analyzer=analyzer,
-            vocabulary=vocabulary,
-        )
-
-        attribute_vectorizers = {}
-
-        for attribute in MESSAGE_ATTRIBUTES:
-            attribute_vectorizers[attribute] = shared_vectorizer
-
-        return attribute_vectorizers
-
-    @staticmethod
-    def create_independent_vocab_vectorizers(
-        token_pattern,
-        strip_accents,
-        lowercase,
-        stop_words,
-        ngram_range,
-        max_df,
-        min_df,
-        max_features,
-        analyzer,
-        vocabulary=None,
-    ) -> Dict[Text, "CountVectorizer"]:
-        """Create vectorizers for all attributes with independent vocabulary"""
-
-        attribute_vectorizers = {}
-
-        for attribute in MESSAGE_ATTRIBUTES:
-
-            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
-
-            attribute_vectorizer = CountVectorizer(
-                token_pattern=token_pattern,
-                strip_accents=strip_accents,
-                lowercase=lowercase,
-                stop_words=stop_words,
-                ngram_range=ngram_range,
-                max_df=max_df,
-                min_df=min_df,
-                max_features=max_features,
-                analyzer=analyzer,
-                vocabulary=attribute_vocabulary,
-            )
-            attribute_vectorizers[attribute] = attribute_vectorizer
-
-        return attribute_vectorizers
-
     def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         """Construct the vectorizers and train them with a shared vocab"""
 
-        self.vectorizers = self.create_shared_vocab_vectorizers(
+        self.vectorizers = self._create_shared_vocab_vectorizers(
             self.token_pattern,
             self.strip_accents,
             self.lowercase,
@@ -432,7 +362,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         )
 
         combined_cleaned_texts = []
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             combined_cleaned_texts += attribute_texts[attribute]
 
         try:
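
Aside: the core of the shared-vocab path as a runnable sketch. Every attribute maps to the same `CountVectorizer` object, which is fit once on the texts of all attributes combined. The attribute names and texts are made up for illustration.

    from sklearn.feature_extraction.text import CountVectorizer

    attribute_texts = {  # illustrative cleaned texts per attribute
        "text": ["hello there", "good morning"],
        "intent": ["greet"],
    }
    shared = CountVectorizer(analyzer="word")
    vectorizers = {attribute: shared for attribute in attribute_texts}

    combined_cleaned_texts = []
    for attribute in attribute_texts:
        combined_cleaned_texts += attribute_texts[attribute]

    vectorizers["text"].fit(combined_cleaned_texts)
    assert vectorizers["intent"] is vectorizers["text"]  # one object, one vocab
    print(sorted(vectorizers["intent"].vocabulary_))
    # ['good', 'greet', 'hello', 'morning', 'there']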
@@ -449,7 +379,7 @@ def _attribute_texts_is_non_empty(attribute_texts):
     def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         """Construct the vectorizers and train them with an independent vocab"""
 
-        self.vectorizers = self.create_independent_vocab_vectorizers(
+        self.vectorizers = self._create_independent_vocab_vectorizers(
             self.token_pattern,
             self.strip_accents,
             self.lowercase,
@@ -461,7 +391,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]])
             self.analyzer,
         )
 
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             if self._attribute_texts_is_non_empty(attribute_texts[attribute]):
                 try:
                     self.vectorizers[attribute].fit(attribute_texts[attribute])
@@ -516,7 +446,7 @@ def train(
             self._train_with_independent_vocab(processed_attribute_texts)
 
         # transform for all attributes
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
 
             attribute_features = self._get_featurized_attribute(
                 attribute, processed_attribute_texts[attribute]
@@ -556,6 +486,16 @@ def process(self, message: Message, **kwargs: Any) -> None:
             ),
         )
 
+    def _collect_vectorizer_vocabularies(self):
+        """Get vocabulary for all attributes"""
+
+        attribute_vocabularies = {}
+        for attribute in self._attributes:
+            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
+                attribute
+            )
+        return attribute_vocabularies
+
     @staticmethod
     def _is_any_model_trained(attribute_vocabularies) -> bool:
         """Check if any model got trained"""
@@ -586,6 +526,80 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
         utils.json_pickle(featurizer_file, attribute_vocabularies)
         return {"file": file_name}
 
+    @classmethod
+    def _create_shared_vocab_vectorizers(
+        cls,
+        token_pattern,
+        strip_accents,
+        lowercase,
+        stop_words,
+        ngram_range,
+        max_df,
+        min_df,
+        max_features,
+        analyzer,
+        vocabulary=None,
+    ) -> Dict[Text, "CountVectorizer"]:
+        """Create vectorizers for all attributes with shared vocabulary"""
+
+        shared_vectorizer = CountVectorizer(
+            token_pattern=token_pattern,
+            strip_accents=strip_accents,
+            lowercase=lowercase,
+            stop_words=stop_words,
+            ngram_range=ngram_range,
+            max_df=max_df,
+            min_df=min_df,
+            max_features=max_features,
+            analyzer=analyzer,
+            vocabulary=vocabulary,
+        )
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes(analyzer):
+            attribute_vectorizers[attribute] = shared_vectorizer
+
+        return attribute_vectorizers
+
+    @classmethod
+    def _create_independent_vocab_vectorizers(
+        cls,
+        token_pattern,
+        strip_accents,
+        lowercase,
+        stop_words,
+        ngram_range,
+        max_df,
+        min_df,
+        max_features,
+        analyzer,
+        vocabulary=None,
+    ) -> Dict[Text, "CountVectorizer"]:
+        """Create vectorizers for all attributes with independent vocabulary"""
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes(analyzer):
+
+            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
+
+            attribute_vectorizer = CountVectorizer(
+                token_pattern=token_pattern,
+                strip_accents=strip_accents,
+                lowercase=lowercase,
+                stop_words=stop_words,
+                ngram_range=ngram_range,
+                max_df=max_df,
+                min_df=min_df,
+                max_features=max_features,
+                analyzer=analyzer,
+                vocabulary=attribute_vocabulary,
+            )
+            attribute_vectorizers[attribute] = attribute_vectorizer
+
+        return attribute_vectorizers
+
     @classmethod
     def load(
         cls,
@@ -605,7 +619,7 @@ def load(
         share_vocabulary = meta["use_shared_vocab"]
 
         if share_vocabulary:
-            vectorizers = cls.create_shared_vocab_vectorizers(
+            vectorizers = cls._create_shared_vocab_vectorizers(
                 token_pattern=meta["token_pattern"],
                 strip_accents=meta["strip_accents"],
                 lowercase=meta["lowercase"],
@@ -618,7 +632,7 @@ def load(
                 vocabulary=vocabulary,
             )
         else:
-            vectorizers = cls.create_independent_vocab_vectorizers(
+            vectorizers = cls._create_independent_vocab_vectorizers(
                 token_pattern=meta["token_pattern"],
                 strip_accents=meta["strip_accents"],
                 lowercase=meta["lowercase"],
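
Aside: the mechanism `load()` leans on here is that scikit-learn's `CountVectorizer` accepts a fixed `vocabulary` mapping and can then transform without ever being refit, so the persisted vocabularies are enough to rebuild working vectorizers. A sketch with a made-up persisted vocabulary:

    from sklearn.feature_extraction.text import CountVectorizer

    persisted_vocab = {"hello": 0, "world": 1}  # illustrative persisted mapping
    restored = CountVectorizer(vocabulary=persisted_vocab)
    print(restored.transform(["hello hello world"]).toarray())  # [[2 1]]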