Commit ce1845f

add bert stuff
1 parent 6966025 commit ce1845f

File tree

3 files changed: +236 −1 lines changed
rasa_nlu/featurizers/bert_featurizer.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import typing
from typing import Any

from rasa_nlu.featurizers import Featurizer
from rasa_nlu.training_data import Message
from rasa_nlu.training_data import TrainingData
from extract_features import create_features, model_fn_builder
from rasa_nlu import config

import tensorflow as tf
import modeling
import tokenization

if typing.TYPE_CHECKING:
    from spacy.language import Language
    from spacy.tokens import Doc

tf.logging.set_verbosity(tf.logging.INFO)


def ndim(spacy_nlp):
    # type: (Language) -> int
    """Number of features used to represent a document / sentence."""
    return spacy_nlp.vocab.vectors_length


def features_for_doc(doc):
    # type: (Doc) -> np.ndarray
    """Feature vector for a single document / sentence."""
    return doc.vector


class BertFeaturizer(Featurizer):
    name = "intent_featurizer_bert"

    provides = ["text_features"]

    requires = []

    def __init__(self, component_config=None):
        if not component_config:
            component_config = {}

        # makes sure the name of the configuration is part of the config
        # this is important for e.g. persistence
        component_config["name"] = self.name
        self.component_config = config.override_defaults(
            self.defaults, component_config)

        self.partial_processing_pipeline = None
        self.partial_processing_context = None

        # only extract the final hidden layer
        self.layer_indexes = [-1]
        bert_config = modeling.BertConfig.from_json_file(
            "/Users/oakela/Documents/RASA/bert/uncased_L-24_H-1024_A-16/bert_config.json")
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file="/Users/oakela/Documents/RASA/bert/uncased_L-24_H-1024_A-16/vocab.txt",
            do_lower_case=True)
        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            master=None,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=8,
                per_host_input_for_training=is_per_host))
        # note: the checkpoint is referenced by its prefix, not by the
        # `.ckpt.index` file - passing the index file makes TensorFlow
        # look for a non-existent `bert_model.ckpt.index.index`
        model_fn = model_fn_builder(
            bert_config=bert_config,
            init_checkpoint="/Users/oakela/Documents/RASA/bert/uncased_L-24_H-1024_A-16/bert_model.ckpt",
            layer_indexes=self.layer_indexes,
            use_tpu=False,
            use_one_hot_embeddings=False)

        # TPUEstimator falls back to CPU/GPU when use_tpu=False
        self.estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=model_fn,
            config=run_config,
            predict_batch_size=8)

    def train(self, training_data, config, **kwargs):
        # type: (TrainingData, Any, **Any) -> None
        messages = [example.text for example in training_data.intent_examples]
        fs = create_features(messages, self.estimator, self.tokenizer,
                             self.layer_indexes)
        features = []
        for x in fs:
            # average the final-layer vectors of all tokens, skipping the
            # [CLS] and [SEP] markers at either end of the sequence
            feats = [y['layers'][0]['values'] for y in x['features'][1:-1]]
            features.append(np.average(feats, axis=0))
        for i, message in enumerate(training_data.intent_examples):
            message.set("text_features", features[i])

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None
        self._set_bert_features(message)

    def _set_bert_features(self, message):
        """Adds the averaged BERT token vectors to the message's text features."""
        fs = create_features([message.text], self.estimator, self.tokenizer,
                             self.layer_indexes)
        feats = [x['layers'][0]['values'] for x in fs[0]['features'][1:-1]]
        features = np.average(feats, axis=0)
        message.set("text_features", features)
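
A minimal sketch of how the new featurizer could be exercised on its own — not part of the commit, and it assumes the hardcoded checkpoint and vocab paths above exist on disk and that the BERT repo's extract_features, modeling and tokenization modules are importable:

    from rasa_nlu.training_data import Message
    from rasa_nlu.featurizers.bert_featurizer import BertFeaturizer

    featurizer = BertFeaturizer()   # builds the TPUEstimator once, up front
    message = Message("book me a table for two")
    featurizer.process(message)     # runs BERT, averages the token vectors
    vector = message.get("text_features")
    print(vector.shape)             # (1024,) for the uncased_L-24_H-1024 model

Note that the featurizer produces one fixed-size sentence vector per message (mean-pooled over tokens), so it slots into the same "text_features" contract the other dense featurizers use.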

rasa_nlu/registry.py

Lines changed: 2 additions & 1 deletion
@@ -33,6 +33,7 @@
 from rasa_nlu.featurizers.spacy_featurizer import SpacyFeaturizer
 from rasa_nlu.featurizers.count_vectors_featurizer import \
     CountVectorsFeaturizer
+from rasa_nlu.featurizers.bert_featurizer import BertFeaturizer
 from rasa_nlu.model import Metadata
 from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
 from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
@@ -53,7 +54,7 @@
     CRFEntityExtractor, DucklingHTTPExtractor,
     EntitySynonymMapper,
     SpacyFeaturizer, MitieFeaturizer, NGramFeaturizer, RegexFeaturizer,
-    CountVectorsFeaturizer,
+    CountVectorsFeaturizer, BertFeaturizer,
     MitieTokenizer, SpacyTokenizer, WhitespaceTokenizer, JiebaTokenizer,
     SklearnIntentClassifier, MitieIntentClassifier, KeywordIntentClassifier,
     EmbeddingIntentClassifier
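
Registering the class in component_classes is what lets a pipeline refer to it by its name string. A hypothetical pipeline config — a sketch, not taken from this commit; the other component names are existing rasa_nlu registry entries — might look like:

    from rasa_nlu import config
    from rasa_nlu.model import Trainer

    cfg = config.RasaNLUModelConfig({
        "language": "en",
        "pipeline": [
            {"name": "tokenizer_whitespace"},
            {"name": "intent_featurizer_bert"},
            {"name": "intent_classifier_sklearn"},
        ],
    })
    trainer = Trainer(cfg)  # resolves each name via the registry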

rasa_nlu/utils/bert_utils.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
import logging
import typing
from typing import Any, Dict, List, Optional, Text

from rasa_nlu.components import Component
from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.training_data import Message, TrainingData

logger = logging.getLogger(__name__)

if typing.TYPE_CHECKING:
    from spacy.language import Language
    from spacy.tokens.doc import Doc
    from rasa_nlu.model import Metadata


class SpacyNLP(Component):
    name = "nlp_spacy"

    provides = ["spacy_doc", "spacy_nlp"]

    defaults = {
        # name of the language model to load - if it is not set
        # we will be looking for a language model that is named
        # after the language of the model, e.g. `en`
        "model": None,

        # when retrieving word vectors, this will decide if the casing
        # of the word is relevant. E.g. `hello` and `Hello` will
        # retrieve the same vector, if set to `False`. For some
        # applications and models it makes sense to differentiate
        # between these two words, therefore setting this to `True`.
        "case_sensitive": False,
    }

    def __init__(self,
                 component_config: Dict[Text, Any] = None,
                 nlp: 'Language' = None) -> None:

        self.nlp = nlp
        super(SpacyNLP, self).__init__(component_config)

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["spacy"]

    @classmethod
    def create(cls, cfg: RasaNLUModelConfig) -> 'SpacyNLP':
        import spacy

        component_conf = cfg.for_component(cls.name, cls.defaults)
        spacy_model_name = component_conf.get("model")

        # if no model is specified, we fall back to the language string
        if not spacy_model_name:
            spacy_model_name = cfg.language
            component_conf["model"] = cfg.language

        logger.info("Trying to load spacy model with "
                    "name '{}'".format(spacy_model_name))

        nlp = spacy.load(spacy_model_name, disable=['parser'])
        cls.ensure_proper_language_model(nlp)
        return SpacyNLP(component_conf, nlp)

    @classmethod
    def cache_key(cls, model_metadata: 'Metadata') -> Text:

        component_meta = model_metadata.for_component(cls.name)

        # Fallback, use the language name, e.g. "en",
        # as the model name if no explicit name is defined
        spacy_model_name = component_meta.get("model", model_metadata.language)

        return cls.name + "-" + spacy_model_name

    def provide_context(self) -> Dict[Text, Any]:
        return {"spacy_nlp": self.nlp}

    def doc_for_text(self, text: Text) -> 'Doc':
        if self.component_config.get("case_sensitive"):
            return self.nlp(text)
        else:
            return self.nlp(text.lower())

    def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:

        for example in training_data.training_examples:
            example.set("spacy_doc", self.doc_for_text(example.text))

    def process(self, message: Message, **kwargs: Any) -> None:

        message.set("spacy_doc", self.doc_for_text(message.text))

    @classmethod
    def load(cls,
             model_dir: Text = None,
             model_metadata: 'Metadata' = None,
             cached_component: Optional['SpacyNLP'] = None,
             **kwargs: Any) -> 'SpacyNLP':
        import spacy

        if cached_component:
            return cached_component

        component_meta = model_metadata.for_component(cls.name)
        model_name = component_meta.get("model")

        nlp = spacy.load(model_name, disable=['parser'])
        cls.ensure_proper_language_model(nlp)
        return cls(component_meta, nlp)

    @staticmethod
    def ensure_proper_language_model(nlp: Optional['Language']) -> None:
        """Checks if the spacy language model is properly loaded.

        Raises an exception if the model is invalid."""

        if nlp is None:
            raise Exception("Failed to load spacy language model. "
                            "Loading the model returned 'None'.")
        if nlp.path is None:
            # Spacy sets the path to `None` if
            # it did not load the model from disk.
            # In this case `nlp` is an unusable stub.
            raise Exception("Failed to load spacy language model for "
                            "lang '{}'. Make sure you have downloaded the "
                            "correct model (https://spacy.io/docs/usage/)."
                            "".format(nlp.lang))
