Commit b364c13

Add create_features to Featurizer code
1 parent 9359f27 commit b364c13

File tree

1 file changed (+50, -5 lines)

rasa_nlu/featurizers/bert_featurizer.py

Lines changed: 50 additions & 5 deletions
@@ -4,15 +4,12 @@
 from __future__ import unicode_literals

 import numpy as np
-from typing import Any
 import os

 from rasa_nlu.featurizers import Featurizer
-from rasa_nlu.training_data import Message
-from rasa_nlu.training_data import TrainingData
 from rasa_nlu import config
 from bert import modeling, tokenization
-from bert.extract_features import create_features, model_fn_builder
+from bert.extract_features import *

 import tensorflow as tf

@@ -92,8 +89,56 @@ def process(self, message, **kwargs):
     def _set_bert_features(self, message):
         """Adds the BERT word vectors to the message's text features."""
         # print(message)
-        fs = create_features([message.text], self.estimator, self.tokenizer, self.layer_indexes)
+        fs = self.create_features([message.text], self.estimator, self.tokenizer, self.layer_indexes)
         feats = [x['layers'][0]['values'] for x in fs[0]['features'][1:-1]]
         features = np.average(feats, axis=0)
         # features = np.array(fs[0]['features'][0]['layers'][0]['values'])
         message.set("text_features", features)
+
+    @staticmethod
+    def create_features(examples_array, estimator, tokenizer, layer_indexes):
+        examples = read_array_examples(examples_array)
+
+        features = convert_examples_to_features(
+            examples=examples, seq_length=128, tokenizer=tokenizer)
+
+        unique_id_to_feature = {}
+        for feature in features:
+            unique_id_to_feature[feature.unique_id] = feature
+
+        input_fn = input_fn_builder(
+            features=features, seq_length=128)
+
+        if len(examples_array) > 1:
+            save_hook = tf.train.CheckpointSaverHook('/tmp/bert_model', save_secs=1)
+            predictions = estimator.predict(input_fn,
+                                            hooks=[save_hook],
+                                            yield_single_examples=True)
+        else:
+            predictions = estimator.predict(input_fn, yield_single_examples=True)
+
+        results = []
+
+        for result in predictions:
+            unique_id = int(result["unique_id"])
+            feature = unique_id_to_feature[unique_id]
+            output_json = collections.OrderedDict()
+            output_json["linex_index"] = unique_id
+            all_features = []
+            for (i, token) in enumerate(feature.tokens):
+                all_layers = []
+                for (j, layer_index) in enumerate(layer_indexes):
+                    layer_output = result["layer_output_%d" % j]
+                    layers = collections.OrderedDict()
+                    layers["index"] = layer_index
+                    layers["values"] = [
+                        round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+                    ]
+                    all_layers.append(layers)
+                features = collections.OrderedDict()
+                features["token"] = token
+                features["layers"] = all_layers
+                all_features.append(features)
+            output_json["features"] = all_features
+            results.append(output_json)
+        return results
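
For reference, a toy sketch of the structure create_features returns and of how _set_bert_features consumes it. The tokens and two-dimensional values below are made up for illustration; real vectors come from the BERT estimator (768 dimensions for the base model):

import numpy as np

# Hypothetical create_features output for one example; the values are
# stand-ins, not real BERT activations.
fs = [{
    "linex_index": 0,
    "features": [
        {"token": "[CLS]", "layers": [{"index": -1, "values": [0.1, 0.2]}]},
        {"token": "hello", "layers": [{"index": -1, "values": [0.3, 0.4]}]},
        {"token": "world", "layers": [{"index": -1, "values": [0.5, 0.6]}]},
        {"token": "[SEP]", "layers": [{"index": -1, "values": [0.7, 0.8]}]},
    ],
}]

# As in _set_bert_features: drop [CLS]/[SEP] with [1:-1], take the first
# requested layer for each token, then average into one sentence vector.
feats = [x['layers'][0]['values'] for x in fs[0]['features'][1:-1]]
features = np.average(feats, axis=0)
print(features)  # [0.4 0.5]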

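Note that read_array_examples is not part of upstream bert/extract_features.py, which only ships the file-reading read_examples, so the wildcard import implies it comes from a fork of that module. A minimal sketch of what such a helper would need to do, modeled on read_examples (hypothetical, not the fork's actual code):

import re

from bert.extract_features import InputExample


def read_array_examples(examples_array):
    """Builds one InputExample per input string, mirroring read_examples
    but over an in-memory list instead of a file."""
    examples = []
    for unique_id, line in enumerate(examples_array):
        line = line.strip()
        text_a, text_b = line, None
        # " ||| " separates sentence pairs, as in bert's read_examples.
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m:
            text_a, text_b = m.group(1), m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
    return examples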