@@ -153,16 +153,6 @@ def _get_attribute_vocabulary(self, attribute: Text) -> Optional[Dict[Text, int]
         except (AttributeError, TypeError):
             return None

-    def _collect_vectorizer_vocabularies(self):
-        """Get vocabulary for all attributes"""
-
-        attribute_vocabularies = {}
-        for attribute in MESSAGE_ATTRIBUTES:
-            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
-                attribute
-            )
-        return attribute_vocabularies
-
     def _get_attribute_vocabulary_tokens(self, attribute: Text) -> Optional[List[Text]]:
         """Get all keys of vocabulary of an attribute"""
@@ -192,6 +182,13 @@ def _check_analyzer(self):
                 "contain single letters only."
             )

+    @staticmethod
+    def _attributes(analyzer):
+        """Create a list of attributes that should be featurized."""
+
+        # intents should be featurized only by word level count vectorizer
+        return MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES
+
     def __init__(
         self,
         component_config: Dict[Text, Any] = None,
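Note (reviewer sketch, not part of the diff): the new helper just switches the attribute list on the analyzer, since intents are whole labels and only make sense at word level. A self-contained illustration, assuming hypothetical stand-ins for the Rasa constants MESSAGE_ATTRIBUTES and SPACY_FEATURIZABLE_ATTRIBUTES:

# hypothetical stand-ins; the real values live in the Rasa NLU constants module
MESSAGE_ATTRIBUTES = ["text", "intent", "response"]
SPACY_FEATURIZABLE_ATTRIBUTES = ["text", "response"]

def _attributes(analyzer):
    # char/char_wb analyzers skip intents; the word analyzer featurizes everything
    return MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES

assert _attributes("word") == ["text", "intent", "response"]
assert _attributes("char_wb") == ["text", "response"]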
@@ -210,6 +207,9 @@ def __init__(
         # warn that some of config parameters might be ignored
         self._check_analyzer()

+        # set which attributes to featurize
+        self._attributes = self._attributes(self.analyzer)
+
         # declare class instance for CountVectorizer
         self.vectorizers = vectorizers
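Note on `self._attributes = self._attributes(self.analyzer)`: the right-hand lookup still resolves to the staticmethod on the class (no instance attribute exists yet), and the assignment then shadows it with the computed list for this instance only, so the classmethods added later in this diff can keep calling `cls._attributes(analyzer)`. A minimal sketch of the pattern:

class Demo:
    @staticmethod
    def _attributes(analyzer):
        return ["text", "intent"] if analyzer == "word" else ["text"]

    def __init__(self, analyzer):
        # class lookup first, then shadowed by the result on the instance
        self._attributes = self._attributes(analyzer)

d = Demo("word")
assert d._attributes == ["text", "intent"]   # instance: the computed list
assert Demo._attributes("char") == ["text"]  # class: still the staticmethod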
@@ -335,7 +335,7 @@ def _get_all_attributes_processed_texts(
         """Get processed text for all attributes of examples in training data"""

         processed_attribute_texts = {}
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             attribute_texts = [
                 self._get_message_text_by_attribute(example, attribute)
                 for example in training_data.intent_examples
@@ -344,82 +344,10 @@ def _get_all_attributes_processed_texts(
             processed_attribute_texts[attribute] = attribute_texts
         return processed_attribute_texts

-    @staticmethod
-    def create_shared_vocab_vectorizers(
-        token_pattern,
-        strip_accents,
-        lowercase,
-        stop_words,
-        ngram_range,
-        max_df,
-        min_df,
-        max_features,
-        analyzer,
-        vocabulary=None,
-    ) -> Dict[Text, "CountVectorizer"]:
-        """Create vectorizers for all attributes with shared vocabulary"""
-
-        shared_vectorizer = CountVectorizer(
-            token_pattern=token_pattern,
-            strip_accents=strip_accents,
-            lowercase=lowercase,
-            stop_words=stop_words,
-            ngram_range=ngram_range,
-            max_df=max_df,
-            min_df=min_df,
-            max_features=max_features,
-            analyzer=analyzer,
-            vocabulary=vocabulary,
-        )
-
-        attribute_vectorizers = {}
-
-        for attribute in MESSAGE_ATTRIBUTES:
-            attribute_vectorizers[attribute] = shared_vectorizer
-
-        return attribute_vectorizers
-
-    @staticmethod
-    def create_independent_vocab_vectorizers(
-        token_pattern,
-        strip_accents,
-        lowercase,
-        stop_words,
-        ngram_range,
-        max_df,
-        min_df,
-        max_features,
-        analyzer,
-        vocabulary=None,
-    ) -> Dict[Text, "CountVectorizer"]:
-        """Create vectorizers for all attributes with independent vocabulary"""
-
-        attribute_vectorizers = {}
-
-        for attribute in MESSAGE_ATTRIBUTES:
-
-            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
-
-            attribute_vectorizer = CountVectorizer(
-                token_pattern=token_pattern,
-                strip_accents=strip_accents,
-                lowercase=lowercase,
-                stop_words=stop_words,
-                ngram_range=ngram_range,
-                max_df=max_df,
-                min_df=min_df,
-                max_features=max_features,
-                analyzer=analyzer,
-                vocabulary=attribute_vocabulary,
-            )
-            attribute_vectorizers[attribute] = attribute_vectorizer
-
-        return attribute_vectorizers
-
     def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         """Construct the vectorizers and train them with a shared vocab"""

-        self.vectorizers = self.create_shared_vocab_vectorizers(
+        self.vectorizers = self._create_shared_vocab_vectorizers(
             self.token_pattern,
             self.strip_accents,
             self.lowercase,
@@ -432,7 +360,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         )

         combined_cleaned_texts = []
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             combined_cleaned_texts += attribute_texts[attribute]

         try:
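For context, the shared-vocab path concatenates every attribute's texts and fits one CountVectorizer on the result, so all attributes end up indexed against a single vocabulary. A self-contained sketch with made-up data:

from sklearn.feature_extraction.text import CountVectorizer

attribute_texts = {
    "text": ["book a flight", "cancel my flight"],
    "intent": ["book_flight", "cancel_flight"],
}

combined_cleaned_texts = []
for attribute in attribute_texts:
    combined_cleaned_texts += attribute_texts[attribute]

shared = CountVectorizer(analyzer="word")
shared.fit(combined_cleaned_texts)  # one fit -> one vocabulary for all attributes
print(sorted(shared.vocabulary_))   # tokens from "text" and "intent" alike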
@@ -449,7 +377,7 @@ def _attribute_texts_is_non_empty(attribute_texts):
     def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]):
         """Construct the vectorizers and train them with an independent vocab"""

-        self.vectorizers = self.create_independent_vocab_vectorizers(
+        self.vectorizers = self._create_independent_vocab_vectorizers(
             self.token_pattern,
             self.strip_accents,
             self.lowercase,
@@ -461,7 +389,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]])
             self.analyzer,
         )

-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:
             if self._attribute_texts_is_non_empty(attribute_texts[attribute]):
                 try:
                     self.vectorizers[attribute].fit(attribute_texts[attribute])
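The `try` guard around `fit` matters because CountVectorizer raises ValueError when an attribute's texts produce an empty vocabulary (for example, only stop words); presumably the except clause just below this hunk logs and skips the attribute. A quick sketch of the failure mode:

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(stop_words="english")
try:
    vec.fit(["the a an"])  # every token is a stop word
except ValueError as e:
    print(f"skipping attribute: {e}")  # empty vocabulary error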
@@ -516,7 +444,7 @@ def train(
             self._train_with_independent_vocab(processed_attribute_texts)

         # transform for all attributes
-        for attribute in MESSAGE_ATTRIBUTES:
+        for attribute in self._attributes:

             attribute_features = self._get_featurized_attribute(
                 attribute, processed_attribute_texts[attribute]
@@ -556,6 +484,16 @@ def process(self, message: Message, **kwargs: Any) -> None:
             ),
         )

+    def _collect_vectorizer_vocabularies(self):
+        """Get vocabulary for all attributes"""
+
+        attribute_vocabularies = {}
+        for attribute in self._attributes:
+            attribute_vocabularies[attribute] = self._get_attribute_vocabulary(
+                attribute
+            )
+        return attribute_vocabularies
+
     @staticmethod
     def _is_any_model_trained(attribute_vocabularies) -> bool:
         """Check if any model got trained"""
@@ -586,6 +524,80 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]
         utils.json_pickle(featurizer_file, attribute_vocabularies)
         return {"file": file_name}

+    @classmethod
+    def _create_shared_vocab_vectorizers(
+        cls,
+        token_pattern,
+        strip_accents,
+        lowercase,
+        stop_words,
+        ngram_range,
+        max_df,
+        min_df,
+        max_features,
+        analyzer,
+        vocabulary=None,
+    ) -> Dict[Text, "CountVectorizer"]:
+        """Create vectorizers for all attributes with shared vocabulary"""
+
+        shared_vectorizer = CountVectorizer(
+            token_pattern=token_pattern,
+            strip_accents=strip_accents,
+            lowercase=lowercase,
+            stop_words=stop_words,
+            ngram_range=ngram_range,
+            max_df=max_df,
+            min_df=min_df,
+            max_features=max_features,
+            analyzer=analyzer,
+            vocabulary=vocabulary,
+        )
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes(analyzer):
+            attribute_vectorizers[attribute] = shared_vectorizer
+
+        return attribute_vectorizers
+
+    @classmethod
+    def _create_independent_vocab_vectorizers(
+        cls,
+        token_pattern,
+        strip_accents,
+        lowercase,
+        stop_words,
+        ngram_range,
+        max_df,
+        min_df,
+        max_features,
+        analyzer,
+        vocabulary=None,
+    ) -> Dict[Text, "CountVectorizer"]:
+        """Create vectorizers for all attributes with independent vocabulary"""
+
+        attribute_vectorizers = {}
+
+        for attribute in cls._attributes(analyzer):
+
+            attribute_vocabulary = vocabulary[attribute] if vocabulary else None
+
+            attribute_vectorizer = CountVectorizer(
+                token_pattern=token_pattern,
+                strip_accents=strip_accents,
+                lowercase=lowercase,
+                stop_words=stop_words,
+                ngram_range=ngram_range,
+                max_df=max_df,
+                min_df=min_df,
+                max_features=max_features,
+                analyzer=analyzer,
+                vocabulary=attribute_vocabulary,
+            )
+            attribute_vectorizers[attribute] = attribute_vectorizer
+
+        return attribute_vectorizers
+
     @classmethod
     def load(
         cls,
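The practical difference between the two factories: the shared variant maps every attribute to the same CountVectorizer object (fitting it via one key fits it for all of them), while the independent variant builds a distinct vectorizer per attribute. A sketch of that shape:

from sklearn.feature_extraction.text import CountVectorizer

attributes = ["text", "response"]

shared_vec = CountVectorizer()
shared = {attribute: shared_vec for attribute in attributes}
assert shared["text"] is shared["response"]                # one object, many keys

independent = {attribute: CountVectorizer() for attribute in attributes}
assert independent["text"] is not independent["response"]  # one object per key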
@@ -605,7 +617,7 @@ def load(
         share_vocabulary = meta["use_shared_vocab"]

         if share_vocabulary:
-            vectorizers = cls.create_shared_vocab_vectorizers(
+            vectorizers = cls._create_shared_vocab_vectorizers(
                 token_pattern=meta["token_pattern"],
                 strip_accents=meta["strip_accents"],
                 lowercase=meta["lowercase"],
@@ -618,7 +630,7 @@ def load(
                 vocabulary=vocabulary,
             )
         else:
-            vectorizers = cls.create_independent_vocab_vectorizers(
+            vectorizers = cls._create_independent_vocab_vectorizers(
                 token_pattern=meta["token_pattern"],
                 strip_accents=meta["strip_accents"],
                 lowercase=meta["lowercase"],
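Load-time note: rebuilding the vectorizers from the persisted `meta` parameters plus the pickled vocabulary works without retraining, because a CountVectorizer constructed with a fixed vocabulary can transform immediately. A minimal sketch with a made-up vocabulary:

from sklearn.feature_extraction.text import CountVectorizer

vocabulary = {"flight": 0, "book": 1, "cancel": 2}  # as restored at load time
vec = CountVectorizer(vocabulary=vocabulary)

# no fit needed; the fixed vocabulary defines the feature columns
print(vec.transform(["book a flight"]).toarray())   # [[1 1 0]]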