Commit 3c30155

Merge pull request RasaHQ#5187 from RasaHQ/transformers_lm

Language Models from Transformers Lib

2 parents 32c2ead + 86ee337

20 files changed: +1454 −58 lines

changelog/5187.feature.rst

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+Integrate language models from HuggingFace's Transformers library.
+
+Add a new NLP component ``HFTransformersNLP`` which tokenizes and featurizes incoming messages using a specified
+pre-trained model with the Transformers library as the backend.
+Add ``LanguageModelTokenizer`` and ``LanguageModelFeaturizer`` which use the information from ``HFTransformersNLP``
+and set it correctly on the message object.
+Language models currently supported: BERT, OpenAIGPT, GPT-2, XLNet, DistilBERT, RoBERTa.

docs/nlu/components.rst

Lines changed: 105 additions & 0 deletions
@@ -82,6 +82,54 @@ SpacyNLP
         # between these two words, therefore setting this to `true`.
         case_sensitive: false

+
+.. _HFTransformersNLP:
+
+HFTransformersNLP
+~~~~~~~~~~~~~~~~~
+
+:Short: HuggingFace's Transformers based pre-trained language model initializer
+:Outputs: nothing
+:Requires: nothing
+:Description:
+    Initializes a specified pre-trained language model from HuggingFace's `Transformers library
+    <https://huggingface.co/transformers/>`__. The component applies language model specific tokenization and
+    featurization to compute sequence and sentence level representations for each example in the training data.
+    Include :ref:`LanguageModelTokenizer` and :ref:`LanguageModelFeaturizer` to utilize the output of this
+    component for downstream NLU models.
+:Configuration:
+
+    .. code-block:: yaml
+
+        pipeline:
+          - name: HFTransformersNLP
+
+            # Name of the language model to use
+            model_name: "bert"
+
+            # Shortcut name to specify the architecture variation of the above model. The full list of supported
+            # architectures can be found at https://huggingface.co/transformers/pretrained_models.html .
+            # If left empty, the default model architecture of the original Transformers library is loaded.
+            model_weights: "bert-base-uncased"
+
+            # +----------------+--------------+-------------------------+
+            # | Language Model | Parameter    | Default value for       |
+            # |                | "model_name" | "model_weights"         |
+            # +----------------+--------------+-------------------------+
+            # | BERT           | bert         | bert-base-uncased       |
+            # +----------------+--------------+-------------------------+
+            # | GPT            | gpt          | openai-gpt              |
+            # +----------------+--------------+-------------------------+
+            # | GPT-2          | gpt2         | gpt2                    |
+            # +----------------+--------------+-------------------------+
+            # | XLNet          | xlnet        | xlnet-base-cased        |
+            # +----------------+--------------+-------------------------+
+            # | DistilBERT     | distilbert   | distilbert-base-uncased |
+            # +----------------+--------------+-------------------------+
+            # | RoBERTa        | roberta      | roberta-base            |
+            # +----------------+--------------+-------------------------+
+
+
 Text Featurizers
 ----------------
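To make the table concrete: below is a minimal sketch of how ``model_name`` and ``model_weights`` could map onto the Transformers API. The dictionaries and the ``load_model`` helper are illustrative, not the component's actual code; the imported classes and ``from_pretrained`` calls are part of the Transformers library.

    # Illustrative mapping from "model_name" to Transformers classes; the dicts
    # and load_model() are hypothetical, the library calls are real.
    from transformers import BertTokenizer, TFBertModel, GPT2Tokenizer, TFGPT2Model

    model_class_dict = {"bert": TFBertModel, "gpt2": TFGPT2Model}
    tokenizer_class_dict = {"bert": BertTokenizer, "gpt2": GPT2Tokenizer}
    model_weights_defaults = {"bert": "bert-base-uncased", "gpt2": "gpt2"}

    def load_model(model_name: str, model_weights: str = ""):
        # Fall back to the default weights from the table when none are configured.
        weights = model_weights or model_weights_defaults[model_name]
        tokenizer = tokenizer_class_dict[model_name].from_pretrained(weights)
        model = model_class_dict[model_name].from_pretrained(weights)
        return tokenizer, model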

@@ -182,6 +230,40 @@ ConveRTFeaturizer
             - name: "ConveRTFeaturizer"


+.. _LanguageModelFeaturizer:
+
+LanguageModelFeaturizer
+~~~~~~~~~~~~~~~~~~~~~~~
+
+:Short:
+    Creates a vector representation of user message and response (if specified) using a pre-trained language model.
+:Outputs:
+    nothing, used as an input to intent classifiers and response selectors that need intent features and response
+    features respectively (e.g. ``DIETClassifier`` and ``ResponseSelector``)
+:Requires: :ref:`HFTransformersNLP`
+:Type: Dense featurizer
+:Description:
+    Creates features for intent classification and response selection.
+    Uses the pre-trained language model specified in the upstream :ref:`HFTransformersNLP` component to compute
+    vector representations of the input text.
+
+    .. warning::
+        Please make sure that you use a language model which is pre-trained on the same language corpus as that of
+        your training data.
+
+:Configuration:
+
+    Include the ``HFTransformersNLP`` component before this component. Also, use :ref:`LanguageModelTokenizer` to
+    ensure tokens are correctly set for all components throughout the pipeline.
+
+    .. code-block:: yaml
+
+        pipeline:
+          - name: "HFTransformersNLP"
+            model_name: # Name of language model to use
+          - name: "LanguageModelFeaturizer"
+
+
 RegexFeaturizer
 ~~~~~~~~~~~~~~~
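The ``:Outputs:`` entry above says the features are consumed downstream rather than returned. A minimal sketch of how a downstream component would read them off the message; the array contents and the 768 dimension are assumed:

    import numpy as np
    from rasa.nlu.constants import DENSE_FEATURE_NAMES, TEXT
    from rasa.nlu.training_data import Message

    message = Message("book me a flight")
    # Stand-in for what LanguageModelFeaturizer would set (values and dim assumed).
    message.set(DENSE_FEATURE_NAMES[TEXT], np.zeros((5, 768)))

    dense_features = message.get(DENSE_FEATURE_NAMES[TEXT])
    print(dense_features.shape)  # (5, 768): one row per token plus a pooled sentence row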

@@ -784,6 +866,29 @@ ConveRTTokenizer
     Creates tokens using the ConveRT tokenizer. Must be used whenever the ``ConveRTFeaturizer`` is used.


+.. _LanguageModelTokenizer:
+
+LanguageModelTokenizer
+~~~~~~~~~~~~~~~~~~~~~~
+
+:Short: Tokenizer from pre-trained language models
+:Outputs: nothing
+:Requires: :ref:`HFTransformersNLP`
+:Description:
+    Creates tokens using the pre-trained language model specified in the upstream :ref:`HFTransformersNLP` component.
+    Must be used whenever the ``LanguageModelFeaturizer`` is used.
+:Configuration:
+
+    Include the ``HFTransformersNLP`` component upstream.
+
+    .. code-block:: yaml
+
+        pipeline:
+          - name: "HFTransformersNLP"
+            model_name: # name of language model to use
+          - name: "LanguageModelTokenizer"
+
+

 Entity Extractors
 -----------------

rasa/nlu/constants.py

Lines changed: 13 additions & 6 deletions
@@ -17,11 +17,7 @@

 MESSAGE_ATTRIBUTES = [TEXT, INTENT, RESPONSE]

-TOKENS_NAMES = {
-    TEXT: "tokens",
-    INTENT: "intent_tokens",
-    RESPONSE: "response_tokens",
-}
+TOKENS_NAMES = {TEXT: "tokens", INTENT: "intent_tokens", RESPONSE: "response_tokens"}

 SPARSE_FEATURE_NAMES = {
     TEXT: "text_sparse_features",
@@ -35,7 +31,18 @@
     RESPONSE: "response_dense_features",
 }

-SPACY_DOCS = {TEXT: "spacy_doc", RESPONSE: "response_spacy_doc"}
+LANGUAGE_MODEL_DOCS = {
+    TEXT: "text_language_model_doc",
+    RESPONSE: "response_language_model_doc",
+}
+
+TOKEN_IDS = "token_ids"
+TOKENS = "tokens"
+SEQUENCE_FEATURES = "sequence_features"
+SENTENCE_FEATURES = "sentence_features"
+
+SPACY_DOCS = {TEXT: "text_spacy_doc", RESPONSE: "response_spacy_doc"}
+

 DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT, RESPONSE]
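The new constants name the entries of the "language model doc" that ``HFTransformersNLP`` attaches to each message. A hypothetical example of such a doc, with values and shapes assumed for a two-token message and a 768-dimensional model (the real component stores ``Token`` objects, not plain strings):

    import numpy as np

    example_language_model_doc = {
        "token_ids": [101, 7592, 2088, 102],      # TOKEN_IDS: vocabulary ids, incl. special tokens
        "tokens": ["hello", "world"],             # TOKENS: the aligned tokens
        "sequence_features": np.zeros((2, 768)),  # SEQUENCE_FEATURES: one vector per token
        "sentence_features": np.zeros((1, 768)),  # SENTENCE_FEATURES: pooled sentence vector
    }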

rasa/nlu/featurizers/dense_featurizer/lm_featurizer.py

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+import numpy as np
+from typing import Any, Optional, Text
+
+from rasa.nlu.config import RasaNLUModelConfig
+from rasa.nlu.featurizers.featurizer import Featurizer
+from rasa.nlu.training_data import Message, TrainingData
+
+from rasa.nlu.constants import (
+    TEXT,
+    LANGUAGE_MODEL_DOCS,
+    DENSE_FEATURE_NAMES,
+    DENSE_FEATURIZABLE_ATTRIBUTES,
+    TOKENS_NAMES,
+    SEQUENCE_FEATURES,
+    SENTENCE_FEATURES,
+)
+
+
+class LanguageModelFeaturizer(Featurizer):
+
+    provides = [
+        DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    requires = [
+        LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+    ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES]
+
+    def train(
+        self,
+        training_data: TrainingData,
+        config: Optional[RasaNLUModelConfig] = None,
+        **kwargs: Any,
+    ) -> None:
+
+        for example in training_data.training_examples:
+            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
+                self._set_lm_features(example, attribute)
+
+    def get_doc(self, message: Message, attribute: Text) -> Any:
+
+        return message.get(LANGUAGE_MODEL_DOCS[attribute])
+
+    def process(self, message: Message, **kwargs: Any) -> None:
+
+        self._set_lm_features(message)
+
+    def _set_lm_features(self, message: Message, attribute: Text = TEXT):
+        """Adds the precomputed word vectors to the messages features."""
+
+        doc = self.get_doc(message, attribute)
+
+        if doc is not None:
+            sequence_features = doc[SEQUENCE_FEATURES]
+            sentence_features = doc[SENTENCE_FEATURES]
+
+            features = np.concatenate([sequence_features, sentence_features])
+
+            features = self._combine_with_existing_dense_features(
+                message, features, DENSE_FEATURE_NAMES[attribute]
+            )
+            message.set(DENSE_FEATURE_NAMES[attribute], features)
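The ``np.concatenate`` call in ``_set_lm_features`` stacks the per-token vectors and the pooled sentence vector along the first axis, so the sentence vector ends up as one extra row. A quick sketch with an assumed 768 dimension:

    import numpy as np

    sequence_features = np.random.rand(4, 768)  # one row per token (dim assumed)
    sentence_features = np.random.rand(1, 768)  # single pooled row

    features = np.concatenate([sequence_features, sentence_features])
    print(features.shape)  # (5, 768): the sentence vector is appended as a final row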

rasa/nlu/registry.py

Lines changed: 6 additions & 0 deletions
@@ -29,6 +29,7 @@
 from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
     CountVectorsFeaturizer,
 )
+from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
 from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
 from rasa.nlu.model import Metadata
 from rasa.nlu.selectors.response_selector import ResponseSelector
@@ -37,8 +38,10 @@
 from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
+from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
 from rasa.nlu.utils.mitie_utils import MitieNLP
 from rasa.nlu.utils.spacy_utils import SpacyNLP
+from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
 from rasa.utils.common import class_from_module_path, raise_warning
 from rasa.utils.tensorflow.constants import (
     INTENT_CLASSIFICATION,
@@ -59,12 +62,14 @@
     # utils
     SpacyNLP,
     MitieNLP,
+    HFTransformersNLP,
     # tokenizers
     MitieTokenizer,
     SpacyTokenizer,
     WhitespaceTokenizer,
     ConveRTTokenizer,
     JiebaTokenizer,
+    LanguageModelTokenizer,
     # extractors
     SpacyEntityExtractor,
     MitieEntityExtractor,
@@ -78,6 +83,7 @@
     LexicalSyntacticFeaturizer,
     CountVectorsFeaturizer,
     ConveRTFeaturizer,
+    LanguageModelFeaturizer,
     # classifiers
     SklearnIntentClassifier,
     MitieIntentClassifier,
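Adding the three classes to ``component_classes`` is what lets a pipeline config refer to them by name: a component's default ``name`` is its class name, and the registry builds a name-to-class lookup from this list. A sketch of that mechanism (the lookup line mirrors what the registry is expected to do; treat it as illustrative):

    from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
    from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
    from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer

    component_classes = [HFTransformersNLP, LanguageModelTokenizer, LanguageModelFeaturizer]
    registered_components = {c.name: c for c in component_classes}

    # so `- name: "HFTransformersNLP"` in config.yml resolves to the class:
    assert registered_components["HFTransformersNLP"] is HFTransformersNLP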

rasa/nlu/tokenizers/convert_tokenizer.py

Lines changed: 2 additions & 37 deletions
@@ -4,6 +4,7 @@
 from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 from rasa.nlu.training_data import Message
 from rasa.nlu.constants import MESSAGE_ATTRIBUTES, TOKENS_NAMES
+import rasa.utils.train_utils as train_utils
 import tensorflow as tf


@@ -69,10 +70,9 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         # clean tokens (remove special chars and empty tokens)
         split_token_strings = self._clean_tokens(split_token_strings)

-        _aligned_tokens = self._align_tokens(
+        tokens_out += train_utils.align_tokens(
             split_token_strings, token_end, token_start
         )
-        tokens_out += _aligned_tokens

         return tokens_out

@@ -81,38 +81,3 @@ def _clean_tokens(self, tokens: List[bytes]):

         tokens = [string.decode("utf-8").replace("﹏", "") for string in tokens]
         return [string for string in tokens if string]
-
-    def _align_tokens(self, tokens_in: List[Text], token_end: int, token_start: int):
-        """Align sub-tokens of ConveRT with tokens return by the WhitespaceTokenizer.
-
-        As ConveRT might split a single word into multiple tokens, we need to make
-        sure that the start and end value of first and last sub-token matches the
-        start and end value of the token return by the WhitespaceTokenizer as the
-        entities are using those start and end values.
-        """
-
-        tokens_out = []
-
-        current_token_offset = token_start
-
-        for index, string in enumerate(tokens_in):
-            if index == 0:
-                if index == len(tokens_in) - 1:
-                    s_token_end = token_end
-                else:
-                    s_token_end = current_token_offset + len(string)
-                tokens_out.append(Token(string, token_start, end=s_token_end))
-            elif index == len(tokens_in) - 1:
-                tokens_out.append(Token(string, current_token_offset, end=token_end))
-            else:
-                tokens_out.append(
-                    Token(
-                        string,
-                        current_token_offset,
-                        end=current_token_offset + len(string),
-                    )
-                )
-
-            current_token_offset += len(string)
-
-        return tokens_out
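The deleted ``_align_tokens`` logic now lives in ``rasa.utils.train_utils.align_tokens`` (signature inferred from the new call site). A self-contained sketch of the same alignment, with a minimal stand-in ``Token``:

    from typing import List, Optional, Text

    class Token:
        # Minimal stand-in for rasa.nlu.tokenizers.tokenizer.Token.
        def __init__(self, text: Text, start: int, end: Optional[int] = None):
            self.text, self.start, self.end = text, start, end

    def align_tokens(tokens_in: List[Text], token_end: int, token_start: int) -> List[Token]:
        # Same logic as the removed _align_tokens method above.
        tokens_out = []
        current_token_offset = token_start
        for index, string in enumerate(tokens_in):
            if index == 0:
                # First sub-token keeps the original start; if it is also the
                # last one, it keeps the original end too.
                end = token_end if index == len(tokens_in) - 1 else current_token_offset + len(string)
                tokens_out.append(Token(string, token_start, end=end))
            elif index == len(tokens_in) - 1:
                # Last sub-token keeps the original end offset.
                tokens_out.append(Token(string, current_token_offset, end=token_end))
            else:
                tokens_out.append(
                    Token(string, current_token_offset, end=current_token_offset + len(string))
                )
            current_token_offset += len(string)
        return tokens_out

    # "handbag" at character offsets 4..11, split into two sub-tokens:
    for t in align_tokens(["hand", "bag"], token_end=11, token_start=4):
        print(t.text, t.start, t.end)  # hand 4 8 / bag 8 11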
rasa/nlu/tokenizers/lm_tokenizer.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+from typing import Text, List, Any, Dict
+
+from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
+from rasa.nlu.training_data import Message
+
+from rasa.nlu.constants import (
+    TOKENS_NAMES,
+    LANGUAGE_MODEL_DOCS,
+    DENSE_FEATURIZABLE_ATTRIBUTES,
+    MESSAGE_ATTRIBUTES,
+    TOKENS,
+)
+
+
+class LanguageModelTokenizer(Tokenizer):
+
+    provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES]
+
+    requires = [
+        LANGUAGE_MODEL_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES
+    ]
+
+    defaults = {
+        # Flag to check whether to split intents
+        "intent_tokenization_flag": False,
+        # Symbol on which intent should be split
+        "intent_split_symbol": "_",
+    }
+
+    def get_doc(self, message: Message, attribute: Text) -> Dict[Text, Any]:
+        return message.get(LANGUAGE_MODEL_DOCS[attribute])
+
+    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
+        doc = self.get_doc(message, attribute)
+
+        return doc[TOKENS]
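A hypothetical flow showing where ``tokenize`` gets its tokens: ``HFTransformersNLP`` stores the doc first, and this tokenizer simply returns its precomputed ``TOKENS`` entry (the message text and doc contents below are placeholders; real docs hold ``Token`` objects, not strings):

    from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
    from rasa.nlu.training_data import Message

    message = Message("hello world")
    # Stand-in for what HFTransformersNLP would set; the key is
    # LANGUAGE_MODEL_DOCS[TEXT] from rasa.nlu.constants.
    message.set("text_language_model_doc", {"tokens": ["hello", "world"]})

    tokenizer = LanguageModelTokenizer()
    print(tokenizer.tokenize(message, attribute="text"))  # ['hello', 'world']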

rasa/nlu/utils/hugging_face/__init__.py

Whitespace-only changes.
