|
4 | 4 | from __future__ import unicode_literals
|
5 | 5 |
|
6 | 6 | import numpy as np
|
7 |
| -from typing import Any |
8 | 7 | import os
|
9 | 8 |
|
10 | 9 | from rasa_nlu.featurizers import Featurizer
|
11 |
| -from rasa_nlu.training_data import Message |
12 |
| -from rasa_nlu.training_data import TrainingData |
13 | 10 | from rasa_nlu import config
|
14 | 11 | from bert import modeling, tokenization
|
15 |
| -from bert.extract_features import create_features, model_fn_builder |
| 12 | +from bert.extract_features import * |
16 | 13 |
|
17 | 14 | import tensorflow as tf
|
18 | 15 |
|
@@ -92,8 +89,56 @@ def process(self, message, **kwargs):
|
def _set_bert_features(self, message):
    """Compute a BERT embedding for the message text and store it as text features.

    Feeds the raw message text through ``create_features`` and averages the
    per-token vectors of the first requested layer into a single sentence
    vector, which is attached to the message under ``"text_features"``.
    """
    outputs = self.create_features(
        [message.text], self.estimator, self.tokenizer, self.layer_indexes)
    # Drop the first and last token entries (presumably the [CLS]/[SEP]
    # delimiter word pieces — TODO confirm) and keep each remaining token's
    # first requested layer activations.
    token_vectors = [
        entry['layers'][0]['values']
        for entry in outputs[0]['features'][1:-1]
    ]
    sentence_vector = np.average(token_vectors, axis=0)
    message.set("text_features", sentence_vector)
| 97 | + |
@staticmethod
def create_features(examples_array, estimator, tokenizer, layer_indexes,
                    seq_length=128):
    """Run texts through the BERT estimator and collect per-token activations.

    Args:
        examples_array: list of raw text strings to embed.
        estimator: a TensorFlow Estimator set up for BERT feature extraction
            (its predictions expose ``"unique_id"`` and ``"layer_output_<j>"``).
        tokenizer: BERT tokenizer used to split texts into word pieces.
        layer_indexes: indexes of the transformer layers to report, in the
            same order the estimator emits its ``layer_output_<j>`` tensors.
        seq_length: maximum word-piece sequence length. Defaults to 128,
            the previously hard-coded value, so existing callers are unchanged.

    Returns:
        A list with one ``OrderedDict`` per input example, each containing
        ``"linex_index"`` (the example's unique id) and ``"features"`` — a
        list of per-token dicts with the ``"token"`` string and its
        ``"layers"`` (layer ``"index"`` plus rounded activation ``"values"``).
    """
    # Local stdlib import: don't depend on `collections` leaking in through
    # the module's wildcard `from bert.extract_features import *`.
    import collections

    examples = read_array_examples(examples_array)

    # Named `input_features` (not `features`) so the per-token dicts built
    # in the result loop below cannot shadow it.
    input_features = convert_examples_to_features(
        examples=examples, seq_length=seq_length, tokenizer=tokenizer)

    # Map the id the estimator echoes back to its originating input feature.
    unique_id_to_feature = {f.unique_id: f for f in input_features}

    input_fn = input_fn_builder(features=input_features, seq_length=seq_length)

    if len(examples_array) > 1:
        # NOTE(review): saving a checkpoint to /tmp every second looks like a
        # workaround for batched predict() calls — confirm it is still needed.
        save_hook = tf.train.CheckpointSaverHook('/tmp/bert_model', save_secs=1)
        predictions = estimator.predict(input_fn,
                                        hooks=[save_hook],
                                        yield_single_examples=True)
    else:
        predictions = estimator.predict(input_fn, yield_single_examples=True)

    results = []
    for result in predictions:
        unique_id = int(result["unique_id"])
        feature = unique_id_to_feature[unique_id]
        output_json = collections.OrderedDict()
        output_json["linex_index"] = unique_id
        all_features = []
        for i, token in enumerate(feature.tokens):
            all_layers = []
            for j, layer_index in enumerate(layer_indexes):
                layer_output = result["layer_output_%d" % j]
                layers = collections.OrderedDict()
                layers["index"] = layer_index
                # Round to 6 decimals, matching bert's extract_features output.
                layers["values"] = [
                    round(float(x), 6) for x in layer_output[i:(i + 1)].flat
                ]
                all_layers.append(layers)
            token_entry = collections.OrderedDict()
            token_entry["token"] = token
            token_entry["layers"] = all_layers
            all_features.append(token_entry)
        output_json["features"] = all_features
        results.append(output_json)
    return results
0 commit comments