Commit ce1845f

add bert stuff
1 parent 6966025 commit ce1845f

File tree

3 files changed: +236 −1 lines changed
rasa_nlu/featurizers/bert_featurizer.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import typing
from typing import Any

from rasa_nlu.featurizers import Featurizer
from rasa_nlu.training_data import Message
from rasa_nlu.training_data import TrainingData
from extract_features import create_features, model_fn_builder
from rasa_nlu import config

import tensorflow as tf
import modeling
import tokenization

if typing.TYPE_CHECKING:
    from spacy.language import Language
    from spacy.tokens import Doc

tf.logging.set_verbosity(tf.logging.INFO)


def ndim(spacy_nlp):
    # type: (Language) -> int
    """Number of features used to represent a document / sentence."""
    return spacy_nlp.vocab.vectors_length


def features_for_doc(doc):
    # type: (Doc) -> np.ndarray
    """Feature vector for a single document / sentence."""
    return doc.vector


class BertFeaturizer(Featurizer):
    name = "intent_featurizer_bert"

    provides = ["text_features"]

    requires = []

    def __init__(self, component_config=None):
        if not component_config:
            component_config = {}

        # makes sure the name of the configuration is part of the config
        # this is important for e.g. persistence
        component_config["name"] = self.name
        self.component_config = config.override_defaults(
            self.defaults, component_config)

        self.partial_processing_pipeline = None
        self.partial_processing_context = None

        # only extract the final hidden layer
        self.layer_indexes = [-1]
        bert_config = modeling.BertConfig.from_json_file(
            "/Users/oakela/Documents/RASA/bert/uncased_L-24_H-1024_A-16/bert_config.json")
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file="/Users/oakela/Documents/RASA/bert/uncased_L-24_H-1024_A-16/vocab.txt",
            do_lower_case=True)
        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            master=None,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=8,
                per_host_input_for_training=is_per_host))
        # note: the checkpoint is referenced by its prefix, not by the
        # `.ckpt.index` file - passing the index file makes TensorFlow
        # look for a non-existent `bert_model.ckpt.index.index`
        model_fn = model_fn_builder(
            bert_config=bert_config,
            init_checkpoint="/Users/oakela/Documents/RASA/bert/uncased_L-24_H-1024_A-16/bert_model.ckpt",
            layer_indexes=self.layer_indexes,
            use_tpu=False,
            use_one_hot_embeddings=False)

        # TPUEstimator falls back to CPU/GPU when use_tpu=False
        self.estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=model_fn,
            config=run_config,
            predict_batch_size=8)

    def train(self, training_data, config, **kwargs):
        # type: (TrainingData, Any, **Any) -> None
        messages = [example.text for example in training_data.intent_examples]
        fs = create_features(messages, self.estimator, self.tokenizer,
                             self.layer_indexes)
        features = []
        for x in fs:
            # average the final-layer vectors of all tokens, skipping the
            # [CLS] and [SEP] markers at either end of the sequence
            feats = [y['layers'][0]['values'] for y in x['features'][1:-1]]
            features.append(np.average(feats, axis=0))
        for i, message in enumerate(training_data.intent_examples):
            message.set("text_features", features[i])

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        self._set_bert_features(message)

    def _set_bert_features(self, message):
        """Adds the averaged BERT token vectors to the message's text features."""
        fs = create_features([message.text], self.estimator, self.tokenizer,
                             self.layer_indexes)
        feats = [x['layers'][0]['values'] for x in fs[0]['features'][1:-1]]
        features = np.average(feats, axis=0)
        message.set("text_features", features)
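
A minimal sketch of how the new featurizer could be exercised on its own — not part of the commit, and it assumes the hardcoded checkpoint and vocab paths above exist on disk and that the BERT repo's extract_features, modeling and tokenization modules are importable:

    from rasa_nlu.training_data import Message
    from rasa_nlu.featurizers.bert_featurizer import BertFeaturizer

    featurizer = BertFeaturizer()   # builds the TPUEstimator once, up front
    message = Message("book me a table for two")
    featurizer.process(message)     # runs BERT, averages the token vectors
    vector = message.get("text_features")
    print(vector.shape)             # (1024,) for the uncased_L-24_H-1024 model

Note that the featurizer produces one fixed-size sentence vector per message (mean-pooled over tokens), so it slots into the same "text_features" contract the other dense featurizers use.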

rasa_nlu/registry.py

Lines changed: 2 additions & 1 deletion
@@ -33,6 +33,7 @@
 from rasa_nlu.featurizers.spacy_featurizer import SpacyFeaturizer
 from rasa_nlu.featurizers.count_vectors_featurizer import \
     CountVectorsFeaturizer
+from rasa_nlu.featurizers.bert_featurizer import BertFeaturizer
 from rasa_nlu.model import Metadata
 from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
@@ -53,7 +54,7 @@
     CRFEntityExtractor, DucklingHTTPExtractor,
     EntitySynonymMapper,
     SpacyFeaturizer, MitieFeaturizer, NGramFeaturizer, RegexFeaturizer,
-    CountVectorsFeaturizer,
+    CountVectorsFeaturizer, BertFeaturizer,
     MitieTokenizer, SpacyTokenizer, WhitespaceTokenizer, JiebaTokenizer,
     SklearnIntentClassifier, MitieIntentClassifier, KeywordIntentClassifier,
     EmbeddingIntentClassifier
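
Registering the class in component_classes is what lets a pipeline refer to it by its name string. A hypothetical pipeline config — a sketch, not taken from this commit; the other component names are existing rasa_nlu registry entries — might look like:

    from rasa_nlu import config
    from rasa_nlu.model import Trainer

    cfg = config.RasaNLUModelConfig({
        "language": "en",
        "pipeline": [
            {"name": "tokenizer_whitespace"},
            {"name": "intent_featurizer_bert"},
            {"name": "intent_classifier_sklearn"},
        ],
    })
    trainer = Trainer(cfg)  # resolves each name via the registry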

rasa_nlu/utils/bert_utils.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
import logging
import typing
from typing import Any, Dict, List, Optional, Text

from rasa_nlu.components import Component
from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.training_data import Message, TrainingData

logger = logging.getLogger(__name__)

if typing.TYPE_CHECKING:
    from spacy.language import Language
    from spacy.tokens.doc import Doc
    from rasa_nlu.model import Metadata


class SpacyNLP(Component):
    name = "nlp_spacy"

    provides = ["spacy_doc", "spacy_nlp"]

    defaults = {
        # name of the language model to load - if it is not set
        # we will be looking for a language model that is named
        # after the language of the model, e.g. `en`
        "model": None,

        # when retrieving word vectors, this will decide if the casing
        # of the word is relevant. E.g. `hello` and `Hello` will
        # retrieve the same vector, if set to `False`. For some
        # applications and models it makes sense to differentiate
        # between these two words, therefore setting this to `True`.
        "case_sensitive": False,
    }

    def __init__(self,
                 component_config: Dict[Text, Any] = None,
                 nlp: 'Language' = None) -> None:

        self.nlp = nlp
        super(SpacyNLP, self).__init__(component_config)

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["spacy"]

    @classmethod
    def create(cls, cfg: RasaNLUModelConfig) -> 'SpacyNLP':
        import spacy

        component_conf = cfg.for_component(cls.name, cls.defaults)
        spacy_model_name = component_conf.get("model")

        # if no model is specified, we fall back to the language string
        if not spacy_model_name:
            spacy_model_name = cfg.language
            component_conf["model"] = cfg.language

        logger.info("Trying to load spacy model with "
                    "name '{}'".format(spacy_model_name))

        nlp = spacy.load(spacy_model_name, disable=['parser'])
        cls.ensure_proper_language_model(nlp)
        return SpacyNLP(component_conf, nlp)

    @classmethod
    def cache_key(cls, model_metadata: 'Metadata') -> Text:

        component_meta = model_metadata.for_component(cls.name)

        # Fallback, use the language name, e.g. "en",
        # as the model name if no explicit name is defined
        spacy_model_name = component_meta.get("model", model_metadata.language)

        return cls.name + "-" + spacy_model_name

    def provide_context(self) -> Dict[Text, Any]:
        return {"spacy_nlp": self.nlp}

    def doc_for_text(self, text: Text) -> 'Doc':
        if self.component_config.get("case_sensitive"):
            return self.nlp(text)
        else:
            return self.nlp(text.lower())

    def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:

        for example in training_data.training_examples:
            example.set("spacy_doc", self.doc_for_text(example.text))

    def process(self, message: Message, **kwargs: Any) -> None:

        message.set("spacy_doc", self.doc_for_text(message.text))

    @classmethod
    def load(cls,
             model_dir: Text = None,
             model_metadata: 'Metadata' = None,
             cached_component: Optional['SpacyNLP'] = None,
             **kwargs: Any) -> 'SpacyNLP':
        import spacy

        if cached_component:
            return cached_component

        component_meta = model_metadata.for_component(cls.name)
        model_name = component_meta.get("model")

        nlp = spacy.load(model_name, disable=['parser'])
        cls.ensure_proper_language_model(nlp)
        return cls(component_meta, nlp)

    @staticmethod
    def ensure_proper_language_model(nlp: Optional['Language']) -> None:
        """Checks if the spacy language model is properly loaded.

        Raises an exception if the model is invalid."""

        if nlp is None:
            raise Exception("Failed to load spacy language model. "
                            "Loading the model returned 'None'.")
        if nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            raise Exception("Failed to load spacy language model for "
                            "lang '{}'. Make sure you have downloaded the "
                            "correct model (https://spacy.io/docs/usage/)."
                            "".format(nlp.lang))
