diff --git a/run_squad_sp.py b/run_squad_sp.py deleted file mode 100644 index cfc1ce1e..00000000 --- a/run_squad_sp.py +++ /dev/null @@ -1,1330 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Lint as: python2, python3 -"""Run ALBERT on SQuAD 1.1 and SQuAD 2.0. using sentence piece tokenization.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import collections -import json -import math -import os -import random -import modeling -import optimization -import tokenization -import numpy as np -import six -from six.moves import map -from six.moves import range -import tensorflow as tf -from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver -from tensorflow.contrib import data as contrib_data -from tensorflow.contrib import tpu as contrib_tpu - -# pylint: disable=g-import-not-at-top -if six.PY2: - import six.moves.cPickle as pickle -else: - import pickle -# pylint: enable=g-import-not-at-top - -flags = tf.flags - -FLAGS = flags.FLAGS - -## Required parameters -flags.DEFINE_string( - "albert_config_file", None, - "The config json file corresponding to the pre-trained ALBERT model. " - "This specifies the model architecture.") - -flags.DEFINE_string( - "vocab_file", None, - "The vocabulary file that the ALBERT model was trained on.") - -flags.DEFINE_string("spm_model_file", None, - "The model file for sentence piece tokenization.") - -flags.DEFINE_string( - "output_dir", None, - "The output directory where the model checkpoints will be written.") - -## Other parameters -flags.DEFINE_string("train_file", None, - "SQuAD json for training. E.g., train-v1.1.json") - -flags.DEFINE_string( - "predict_file", None, - "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") - -flags.DEFINE_string("train_feature_file", None, - "training feature file.") - -flags.DEFINE_string( - "predict_feature_file", None, - "predict feature file.") - -flags.DEFINE_string( - "predict_feature_left_file", None, - "predict data kept but not pass to tpu.") - -flags.DEFINE_string( - "init_checkpoint", None, - "Initial checkpoint (usually from a pre-trained ALBERT model).") - -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") - -flags.DEFINE_integer( - "max_seq_length", 384, - "The maximum total input sequence length after WordPiece tokenization. " - "Sequences longer than this will be truncated, and sequences shorter " - "than this will be padded.") - -flags.DEFINE_integer( - "doc_stride", 128, - "When splitting up a long document into chunks, how much stride to " - "take between chunks.") - -flags.DEFINE_integer( - "max_query_length", 64, - "The maximum number of tokens for the question. 
Questions longer than " - "this will be truncated to this length.") - -flags.DEFINE_bool("do_train", False, "Whether to run training.") - -flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") - -flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") - -flags.DEFINE_integer("predict_batch_size", 8, - "Total batch size for predictions.") - -flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") - -flags.DEFINE_float("num_train_epochs", 3.0, - "Total number of training epochs to perform.") - -flags.DEFINE_float( - "warmup_proportion", 0.1, - "Proportion of training to perform linear learning rate warmup for. " - "E.g., 0.1 = 10% of training.") - -flags.DEFINE_integer("save_checkpoints_steps", 1000, - "How often to save the model checkpoint.") - -flags.DEFINE_integer("iterations_per_loop", 1000, - "How many steps to make in each estimator call.") - -flags.DEFINE_integer( - "n_best_size", 20, - "The total number of n-best predictions to generate in the " - "nbest_predictions.json output file.") - -flags.DEFINE_integer( - "max_answer_length", 30, - "The maximum length of an answer that can be generated. This is needed " - "because the start and end predictions are not conditioned on one another.") - -flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") - -tf.flags.DEFINE_string( - "tpu_name", None, - "The Cloud TPU to use for training. This should be either the name " - "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " - "url.") - -tf.flags.DEFINE_string( - "tpu_zone", None, - "[Optional] GCE zone where the Cloud TPU is located in. If not " - "specified, we will attempt to automatically detect the GCE project from " - "metadata.") - -tf.flags.DEFINE_string( - "gcp_project", None, - "[Optional] Project name for the Cloud TPU-enabled project. If not " - "specified, we will attempt to automatically detect the GCE project from " - "metadata.") - -tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") - -flags.DEFINE_integer( - "num_tpu_cores", 8, - "Only used if `use_tpu` is True. Total number of TPU cores to use.") - -flags.DEFINE_bool( - "verbose_logging", False, - "If true, all of the warnings related to data processing will be printed. " - "A number of warnings are expected for a normal SQuAD evaluation.") - -flags.DEFINE_bool( - "version_2_with_negative", False, - "If true, the SQuAD examples contain some that do not have an answer.") - -flags.DEFINE_float( - "null_score_diff_threshold", 0.0, - "If null_score - best_non_null is greater than the threshold predict null.") - - -class SquadExample(object): - """A single training/test example for simple sequence classification. - - For examples without an answer, the start and end position are -1. 
- """ - - def __init__(self, - qas_id, - question_text, - paragraph_text, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=False): - self.qas_id = qas_id - self.question_text = question_text - self.paragraph_text = paragraph_text - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) - s += ", question_text: %s" % ( - tokenization.printable_text(self.question_text)) - s += ", paragraph_text: [%s]" % (" ".join(self.paragraph_text)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.start_position: - s += ", end_position: %d" % (self.end_position) - if self.start_position: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tok_start_to_orig_index, - tok_end_to_orig_index, - token_is_max_context, - tokens, - input_ids, - input_mask, - segment_ids, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tok_start_to_orig_index = tok_start_to_orig_index - self.tok_end_to_orig_index = tok_end_to_orig_index - self.token_is_max_context = token_is_max_context - self.tokens = tokens - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.paragraph_len = paragraph_len - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - -def read_squad_examples(input_file, is_training): - """Read a SQuAD json file into a list of SquadExample.""" - with tf.gfile.Open(input_file, "r") as reader: - input_data = json.load(reader)["data"] - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - orig_answer_text = None - is_impossible = False - - if is_training: - is_impossible = qa.get("is_impossible", False) - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - start_position = answer["answer_start"] - else: - start_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - paragraph_text=paragraph_text, - orig_answer_text=orig_answer_text, - start_position=start_position, - is_impossible=is_impossible) - examples.append(example) - - return examples - - -def _convert_index(index, pos, m=None, is_start=True): - """Converts index.""" - if index[pos] is not None: - return index[pos] - n = len(index) - rear = pos - while rear < n - 1 and index[rear] is None: - rear += 1 - front = pos - while front > 0 and index[front] is None: - front -= 1 - assert index[front] is not None or index[rear] is not None - if index[front] is None: - if index[rear] >= 1: - if is_start: - return 0 - else: - return index[rear] - 1 - return index[rear] - if index[rear] is None: - if m is not None and index[front] < m - 1: - if 
is_start: - return index[front] + 1 - else: - return m - 1 - return index[front] - if is_start: - if index[rear] > index[front] + 1: - return index[front] + 1 - else: - return index[rear] - else: - if index[rear] > index[front] + 1: - return index[rear] - 1 - else: - return index[front] - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - output_fn): - """Loads a data file into a list of `InputBatch`s.""" - - cnt_pos, cnt_neg = 0, 0 - unique_id = 1000000000 - max_n, max_m = 1024, 1024 - f = np.zeros((max_n, max_m), dtype=np.float32) - - for (example_index, example) in enumerate(examples): - - if example_index % 100 == 0: - tf.logging.info("Converting {}/{} pos {} neg {}".format( - example_index, len(examples), cnt_pos, cnt_neg)) - - query_tokens = tokenization.encode_ids( - tokenizer.sp_model, - tokenization.preprocess_text( - example.question_text, lower=FLAGS.do_lower_case)) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - paragraph_text = example.paragraph_text - para_tokens = tokenization.encode_pieces( - tokenizer.sp_model, - tokenization.preprocess_text( - example.paragraph_text, lower=FLAGS.do_lower_case), - return_unicode=False) - - chartok_to_tok_index = [] - tok_start_to_chartok_index = [] - tok_end_to_chartok_index = [] - char_cnt = 0 - for i, token in enumerate(para_tokens): - new_token = six.ensure_binary(token).replace( - tokenization.SPIECE_UNDERLINE, b" ") - chartok_to_tok_index.extend([i] * len(new_token)) - tok_start_to_chartok_index.append(char_cnt) - char_cnt += len(new_token) - tok_end_to_chartok_index.append(char_cnt - 1) - - tok_cat_text = "".join(para_tokens).replace( - tokenization.SPIECE_UNDERLINE.decode("utf-8"), " ") - n, m = len(paragraph_text), len(tok_cat_text) - - if n > max_n or m > max_m: - max_n = max(n, max_n) - max_m = max(m, max_m) - f = np.zeros((max_n, max_m), dtype=np.float32) - - g = {} - - def _lcs_match(max_dist, n=n, m=m): - """Longest-common-substring algorithm.""" - f.fill(0) - g.clear() - - ### longest common sub sequence - # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j)) - for i in range(n): - - # unlike standard LCS, this is specifically optimized for the setting - # because the mismatch between sentence pieces and original text will - # be small - for j in range(i - max_dist, i + max_dist): - if j >= m or j < 0: continue - - if i > 0: - g[(i, j)] = 0 - f[i, j] = f[i - 1, j] - - if j > 0 and f[i, j - 1] > f[i, j]: - g[(i, j)] = 1 - f[i, j] = f[i, j - 1] - - f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0 - if (tokenization.preprocess_text( - paragraph_text[i], lower=FLAGS.do_lower_case, - remove_space=False) == tok_cat_text[j] - and f_prev + 1 > f[i, j]): - g[(i, j)] = 2 - f[i, j] = f_prev + 1 - - max_dist = abs(n - m) + 5 - for _ in range(2): - _lcs_match(max_dist) - if f[n - 1, m - 1] > 0.8 * n: break - max_dist *= 2 - - orig_to_chartok_index = [None] * n - chartok_to_orig_index = [None] * m - i, j = n - 1, m - 1 - while i >= 0 and j >= 0: - if (i, j) not in g: break - if g[(i, j)] == 2: - orig_to_chartok_index[i] = j - chartok_to_orig_index[j] = i - i, j = i - 1, j - 1 - elif g[(i, j)] == 1: - j = j - 1 - else: - i = i - 1 - - if (all(v is None for v in orig_to_chartok_index) or - f[n - 1, m - 1] < 0.8 * n): - tf.logging.info("MISMATCH DETECTED!") - continue - - tok_start_to_orig_index = [] - tok_end_to_orig_index = [] - for i in range(len(para_tokens)): - start_chartok_pos = 
tok_start_to_chartok_index[i] - end_chartok_pos = tok_end_to_chartok_index[i] - start_orig_pos = _convert_index(chartok_to_orig_index, start_chartok_pos, - n, is_start=True) - end_orig_pos = _convert_index(chartok_to_orig_index, end_chartok_pos, - n, is_start=False) - - tok_start_to_orig_index.append(start_orig_pos) - tok_end_to_orig_index.append(end_orig_pos) - - if not is_training: - tok_start_position = tok_end_position = None - - if is_training and example.is_impossible: - tok_start_position = 0 - tok_end_position = 0 - - if is_training and not example.is_impossible: - start_position = example.start_position - end_position = start_position + len(example.orig_answer_text) - 1 - - start_chartok_pos = _convert_index(orig_to_chartok_index, start_position, - is_start=True) - tok_start_position = chartok_to_tok_index[start_chartok_pos] - - end_chartok_pos = _convert_index(orig_to_chartok_index, end_position, - is_start=False) - tok_end_position = chartok_to_tok_index[end_chartok_pos] - assert tok_start_position <= tok_end_position - - def _piece_to_id(x): - if six.PY2 and isinstance(x, six.text_type): - x = six.ensure_binary(x, "utf-8") - return tokenizer.sp_model.PieceToId(x) - - all_doc_tokens = list(map(_piece_to_id, para_tokens)) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_is_max_context = {} - segment_ids = [] - - cur_tok_start_to_orig_index = [] - cur_tok_end_to_orig_index = [] - - tokens.append(tokenizer.sp_model.PieceToId("[CLS]")) - segment_ids.append(0) - for token in query_tokens: - tokens.append(token) - segment_ids.append(0) - tokens.append(tokenizer.sp_model.PieceToId("[SEP]")) - segment_ids.append(0) - - for i in range(doc_span.length): - split_token_index = doc_span.start + i - - cur_tok_start_to_orig_index.append( - tok_start_to_orig_index[split_token_index]) - cur_tok_end_to_orig_index.append( - tok_end_to_orig_index[split_token_index]) - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(1) - tokens.append(tokenizer.sp_model.PieceToId("[SEP]")) - segment_ids.append(1) - - paragraph_len = len(tokens) - input_ids = tokens - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
- while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - span_is_impossible = example.is_impossible - start_position = None - end_position = None - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - # continue - start_position = 0 - end_position = 0 - span_is_impossible = True - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - if is_training and span_is_impossible: - start_position = 0 - end_position = 0 - - if example_index < 20: - tf.logging.info("*** Example ***") - tf.logging.info("unique_id: %s" % (unique_id)) - tf.logging.info("example_index: %s" % (example_index)) - tf.logging.info("doc_span_index: %s" % (doc_span_index)) - tf.logging.info("tok_start_to_orig_index: %s" % " ".join( - [str(x) for x in cur_tok_start_to_orig_index])) - tf.logging.info("tok_end_to_orig_index: %s" % " ".join( - [str(x) for x in cur_tok_end_to_orig_index])) - tf.logging.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) - ])) - tf.logging.info("input_pieces: %s" % " ".join( - [tokenizer.sp_model.IdToPiece(x) for x in tokens])) - tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - tf.logging.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - tf.logging.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - - if is_training and span_is_impossible: - tf.logging.info("impossible example span") - - if is_training and not span_is_impossible: - pieces = [tokenizer.sp_model.IdToPiece(token) for token in - tokens[start_position: (end_position + 1)]] - answer_text = tokenizer.sp_model.DecodePieces(pieces) - tf.logging.info("start_position: %d" % (start_position)) - tf.logging.info("end_position: %d" % (end_position)) - tf.logging.info( - "answer: %s" % (tokenization.printable_text(answer_text))) - - # note(zhiliny): With multi processing, - # the example_index is actually the index within the current process - # therefore we use example_index=None to avoid being used in the future. - # The current code does not use example_index of training data. 
- if is_training: - feat_example_index = None - else: - feat_example_index = example_index - - feature = InputFeatures( - unique_id=unique_id, - example_index=feat_example_index, - doc_span_index=doc_span_index, - tok_start_to_orig_index=cur_tok_start_to_orig_index, - tok_end_to_orig_index=cur_tok_end_to_orig_index, - token_is_max_context=token_is_max_context, - tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens], - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - paragraph_len=paragraph_len, - start_position=start_position, - end_position=end_position, - is_impossible=span_is_impossible) - - # Run callback - output_fn(feature) - - unique_id += 1 - if span_is_impossible: - cnt_neg += 1 - else: - cnt_pos += 1 - - tf.logging.info("Total number of instances: {} = pos {} neg {}".format( - cnt_pos + cnt_neg, cnt_pos, cnt_neg)) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -def create_model(albert_config, is_training, input_ids, input_mask, segment_ids, - use_one_hot_embeddings): - """Creates a classification model.""" - model = modeling.AlbertModel( - config=albert_config, - is_training=is_training, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings) - - final_hidden = model.get_sequence_output() - - final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) - batch_size = final_hidden_shape[0] - seq_length = final_hidden_shape[1] - hidden_size = final_hidden_shape[2] - - output_weights = tf.get_variable( - "cls/squad/output_weights", [2, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) - - output_bias = tf.get_variable( - "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) - - final_hidden_matrix = tf.reshape(final_hidden, - [batch_size * seq_length, hidden_size]) - logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - - logits = tf.reshape(logits, [batch_size, seq_length, 2]) - logits = tf.transpose(logits, [2, 0, 1]) - - unstacked_logits = tf.unstack(logits, axis=0) - - (start_logits, end_logits) = 
(unstacked_logits[0], unstacked_logits[1]) - - return (start_logits, end_logits) - - -def model_fn_builder(albert_config, init_checkpoint, learning_rate, - num_train_steps, num_warmup_steps, use_tpu, - use_one_hot_embeddings): - """Returns `model_fn` closure for TPUEstimator.""" - - def model_fn(features, labels, mode, params): # pylint: disable=unused-argument - """The `model_fn` for TPUEstimator.""" - - tf.logging.info("*** Features ***") - for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) - - unique_ids = features["unique_ids"] - input_ids = features["input_ids"] - input_mask = features["input_mask"] - segment_ids = features["segment_ids"] - - is_training = (mode == tf.estimator.ModeKeys.TRAIN) - - (start_logits, end_logits) = create_model( - albert_config=albert_config, - is_training=is_training, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - use_one_hot_embeddings=use_one_hot_embeddings) - - tvars = tf.trainable_variables() - - initialized_variable_names = {} - scaffold_fn = None - if init_checkpoint: - (assignment_map, initialized_variable_names - ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) - if use_tpu: - - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() - - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = "" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) - - output_spec = None - if mode == tf.estimator.ModeKeys.TRAIN: - seq_length = modeling.get_shape_list(input_ids)[1] - - def compute_loss(logits, positions): - one_hot_positions = tf.one_hot( - positions, depth=seq_length, dtype=tf.float32) - log_probs = tf.nn.log_softmax(logits, axis=-1) - loss = -tf.reduce_mean( - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) - return loss - - start_positions = features["start_positions"] - end_positions = features["end_positions"] - - start_loss = compute_loss(start_logits, start_positions) - end_loss = compute_loss(end_logits, end_positions) - - total_loss = (start_loss + end_loss) / 2.0 - - train_op = optimization.create_optimizer( - total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) - - output_spec = contrib_tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - train_op=train_op, - scaffold_fn=scaffold_fn) - elif mode == tf.estimator.ModeKeys.PREDICT: - predictions = { - "unique_ids": unique_ids, - "start_logits": start_logits, - "end_logits": end_logits, - } - output_spec = contrib_tpu.TPUEstimatorSpec( - mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) - else: - raise ValueError( - "Only TRAIN and PREDICT modes are supported: %s" % (mode)) - - return output_spec - - return model_fn - - -def input_fn_builder(input_file, seq_length, is_training, drop_remainder): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" - - name_to_features = { - "unique_ids": tf.FixedLenFeature([], tf.int64), - "input_ids": tf.FixedLenFeature([seq_length], tf.int64), - "input_mask": tf.FixedLenFeature([seq_length], tf.int64), - "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), - } - - if is_training: - name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64) - name_to_features["end_positions"] = 
tf.FixedLenFeature([], tf.int64) - - def _decode_record(record, name_to_features): - """Decodes a record to a TensorFlow example.""" - example = tf.parse_single_example(record, name_to_features) - - # tf.Example only supports tf.int64, but the TPU only supports tf.int32. - # So cast all int64 to int32. - for name in list(example.keys()): - t = example[name] - if t.dtype == tf.int64: - t = tf.to_int32(t) - example[name] = t - - return example - - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] - - # For training, we want a lot of parallel reading and shuffling. - # For eval, we want no shuffling and parallel reading doesn't matter. - d = tf.data.TFRecordDataset(input_file) - if is_training: - d = d.repeat() - d = d.shuffle(buffer_size=100) - - d = d.apply( - contrib_data.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder)) - - return d - - return input_fn - - -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - - -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file): - """Write final predictions to the json file and log-odds of null if needed.""" - tf.logging.info("Writing predictions to: %s" % (output_prediction_file)) - tf.logging.info("Writing nbest to: %s" % (output_nbest_file)) - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - min_null_feature_index = 0 # the paragraph slice with min mull score - null_start_logit = 0 # the start logit at the slice with min null score - null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - start_indexes = _get_best_indexes(result.start_logits, n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) - # if we could have irrelevant answers, get the min score of irrelevant - if FLAGS.version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] - if feature_null_score < score_null: - score_null = feature_null_score - min_null_feature_index = feature_index - null_start_logit = result.start_logits[0] - null_end_logit = result.end_logits[0] - for start_index in start_indexes: - for end_index in end_indexes: - doc_offset = feature.tokens.index("[SEP]") + 1 - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. 
- if start_index - doc_offset >= len(feature.tok_start_to_orig_index): - continue - if end_index - doc_offset >= len(feature.tok_end_to_orig_index): - continue - # if start_index not in feature.tok_start_to_orig_index: - # continue - # if end_index not in feature.tok_end_to_orig_index: - # continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index - doc_offset, - end_index=end_index - doc_offset, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - - if FLAGS.version_2_with_negative: - prelim_predictions.append( - _PrelimPrediction( - feature_index=min_null_feature_index, - start_index=-1, - end_index=-1, - start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - if pred.start_index >= 0: # this is a non-null prediction - tok_start_to_orig_index = feature.tok_start_to_orig_index - tok_end_to_orig_index = feature.tok_end_to_orig_index - start_orig_pos = tok_start_to_orig_index[pred.start_index] - end_orig_pos = tok_end_to_orig_index[pred.end_index] - - paragraph_text = example.paragraph_text - final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - else: - final_text = "" - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) - - # if we didn't inlude the empty option in the n-best, inlcude it - if FLAGS.version_2_with_negative: - if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", start_logit=null_start_logit, - end_logit=null_end_logit)) - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1 - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1 - - if not FLAGS.version_2_with_negative: - all_predictions[example.qas_id] = nbest_json[0]["text"] - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) - scores_diff_json[example.qas_id] = score_diff - if score_diff > FLAGS.null_score_diff_threshold: - all_predictions[example.qas_id] = "" - else: - all_predictions[example.qas_id] = best_non_null_entry.text - - all_nbest_json[example.qas_id] = nbest_json - - with tf.gfile.GFile(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with tf.gfile.GFile(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if FLAGS.version_2_with_negative: - with tf.gfile.GFile(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - -class FeatureWriter(object): - """Writes InputFeature to TF example file.""" - - def __init__(self, filename, is_training): - self.filename = filename - self.is_training = is_training - self.num_features = 0 - self._writer = tf.python_io.TFRecordWriter(filename) - - def process_feature(self, feature): - """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" - self.num_features += 1 - - def create_int_feature(values): - feature = tf.train.Feature( - int64_list=tf.train.Int64List(value=list(values))) - return feature - - features = collections.OrderedDict() - features["unique_ids"] = create_int_feature([feature.unique_id]) - features["input_ids"] = create_int_feature(feature.input_ids) - features["input_mask"] = create_int_feature(feature.input_mask) - features["segment_ids"] = create_int_feature(feature.segment_ids) - - if self.is_training: - features["start_positions"] = create_int_feature([feature.start_position]) - features["end_positions"] = create_int_feature([feature.end_position]) - impossible = 0 - if feature.is_impossible: - impossible = 1 - features["is_impossible"] = 
create_int_feature([impossible]) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - self._writer.write(tf_example.SerializeToString()) - - def close(self): - self._writer.close() - - -def validate_flags_or_throw(albert_config): - """Validate the input FLAGS or throw an exception.""" - tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, - FLAGS.init_checkpoint) - - if not FLAGS.do_train and not FLAGS.do_predict: - raise ValueError("At least one of `do_train` or `do_predict` must be True.") - - if FLAGS.do_train: - if not FLAGS.train_file: - raise ValueError( - "If `do_train` is True, then `train_file` must be specified.") - if FLAGS.do_predict: - if not FLAGS.predict_file: - raise ValueError( - "If `do_predict` is True, then `predict_file` must be specified.") - - if FLAGS.max_seq_length > albert_config.max_position_embeddings: - raise ValueError( - "Cannot use sequence length %d because the ALBERT model " - "was only trained up to sequence length %d" % - (FLAGS.max_seq_length, albert_config.max_position_embeddings)) - - if FLAGS.max_seq_length <= FLAGS.max_query_length + 3: - raise ValueError( - "The max_seq_length (%d) must be greater than max_query_length " - "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)) - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - - albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) - - validate_flags_or_throw(albert_config) - - tf.gfile.MakeDirs(FLAGS.output_dir) - - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, - spm_model_file=FLAGS.spm_model_file) - - tpu_cluster_resolver = None - if FLAGS.use_tpu and FLAGS.tpu_name: - tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( - FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) - - is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 - run_config = contrib_tpu.RunConfig( - cluster=tpu_cluster_resolver, - master=FLAGS.master, - model_dir=FLAGS.output_dir, - save_checkpoints_steps=FLAGS.save_checkpoints_steps, - tpu_config=contrib_tpu.TPUConfig( - iterations_per_loop=FLAGS.iterations_per_loop, - num_shards=FLAGS.num_tpu_cores, - per_host_input_for_training=is_per_host)) - - train_examples = None - num_train_steps = None - num_warmup_steps = None - if FLAGS.do_train: - train_examples = read_squad_examples( - input_file=FLAGS.train_file, is_training=True) - num_train_steps = int( - len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) - num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) - - # Pre-shuffle the input to avoid having to make a very large shuffle - # buffer in in the `input_fn`. - rng = random.Random(12345) - rng.shuffle(train_examples) - - model_fn = model_fn_builder( - albert_config=albert_config, - init_checkpoint=FLAGS.init_checkpoint, - learning_rate=FLAGS.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - use_tpu=FLAGS.use_tpu, - use_one_hot_embeddings=FLAGS.use_tpu) - - # If TPU is not available, this will fall back to normal Estimator on CPU - # or GPU. - estimator = contrib_tpu.TPUEstimator( - use_tpu=FLAGS.use_tpu, - model_fn=model_fn, - config=run_config, - train_batch_size=FLAGS.train_batch_size, - predict_batch_size=FLAGS.predict_batch_size) - - if FLAGS.do_train: - # We write to a temporary file to avoid storing very large constant tensors - # in memory. 
- - - if not tf.gfile.Exists(FLAGS.train_feature_file): - train_writer = FeatureWriter( - filename=os.path.join(FLAGS.train_feature_file), is_training=True) - convert_examples_to_features( - examples=train_examples, - tokenizer=tokenizer, - max_seq_length=FLAGS.max_seq_length, - doc_stride=FLAGS.doc_stride, - max_query_length=FLAGS.max_query_length, - is_training=True, - output_fn=train_writer.process_feature) - train_writer.close() - - tf.logging.info("***** Running training *****") - tf.logging.info(" Num orig examples = %d", len(train_examples)) - # tf.logging.info(" Num split examples = %d", train_writer.num_features) - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - tf.logging.info(" Num steps = %d", num_train_steps) - del train_examples - - train_input_fn = input_fn_builder( - input_file=FLAGS.train_feature_file, - seq_length=FLAGS.max_seq_length, - is_training=True, - drop_remainder=True) - estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) - - if FLAGS.do_predict: - eval_examples = read_squad_examples( - input_file=FLAGS.predict_file, is_training=False) - - if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists( - FLAGS.predict_feature_left_file)): - tf.logging.info("Loading eval features from {}".format( - FLAGS.predict_feature_left_file)) - with tf.gfile.Open(FLAGS.predict_feature_left_file, 'rb') as fin: - eval_features = pickle.load(fin) - else: - eval_writer = FeatureWriter( - filename=FLAGS.predict_feature_file, is_training=False) - eval_features = [] - - def append_feature(feature): - eval_features.append(feature) - eval_writer.process_feature(feature) - - convert_examples_to_features( - examples=eval_examples, - tokenizer=tokenizer, - max_seq_length=FLAGS.max_seq_length, - doc_stride=FLAGS.doc_stride, - max_query_length=FLAGS.max_query_length, - is_training=False, - output_fn=append_feature) - eval_writer.close() - - with tf.gfile.Open(FLAGS.predict_feature_left_file, 'wb') as fout: - pickle.dump(eval_features, fout) - - tf.logging.info("***** Running predictions *****") - tf.logging.info(" Num orig examples = %d", len(eval_examples)) - tf.logging.info(" Num split examples = %d", len(eval_features)) - tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) - - all_results = [] - - predict_input_fn = input_fn_builder( - input_file=FLAGS.predict_feature_file, - seq_length=FLAGS.max_seq_length, - is_training=False, - drop_remainder=False) - - # If running eval on the TPU, you will need to specify the number of - # steps. 
- all_results = [] - for result in estimator.predict( - predict_input_fn, yield_single_examples=True): - if len(all_results) % 1000 == 0: - tf.logging.info("Processing example: %d" % (len(all_results))) - unique_id = int(result["unique_ids"]) - start_logits = [float(x) for x in result["start_logits"].flat] - end_logits = [float(x) for x in result["end_logits"].flat] - all_results.append( - RawResult( - unique_id=unique_id, - start_logits=start_logits, - end_logits=end_logits)) - - output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") - output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") - output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json") - - write_predictions(eval_examples, eval_features, all_results, - FLAGS.n_best_size, FLAGS.max_answer_length, - FLAGS.do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file) - - -if __name__ == "__main__": - flags.mark_flag_as_required("vocab_file") - flags.mark_flag_as_required("albert_config_file") - flags.mark_flag_as_required("output_dir") - tf.app.run() diff --git a/run_squad_v1.py b/run_squad_v1.py new file mode 100644 index 00000000..e0db5442 --- /dev/null +++ b/run_squad_v1.py @@ -0,0 +1,477 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Lint as: python2, python3 +"""Run ALBERT on SQuAD v1.1 using sentence piece tokenization.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import json +import os +import random +import time +import modeling +from garcon.albert import squad_utils +import tokenization +import six +import tensorflow as tf + +from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver +from tensorflow.contrib import tpu as contrib_tpu + + +# pylint: disable=g-import-not-at-top +if six.PY2: + import six.moves.cPickle as pickle +else: + import pickle +# pylint: enable=g-import-not-at-top + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "albert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string("spm_model_file", None, + "The model file for sentence piece tokenization.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string("train_file", None, + "SQuAD json for training. E.g., train-v1.1.json") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. 
E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string("train_feature_file", None, + "training feature file.") + +flags.DEFINE_string( + "predict_feature_file", None, + "predict feature file.") + +flags.DEFINE_string( + "predict_feature_left_file", None, + "predict data kept but not pass to tpu.") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 384, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("predict_batch_size", 8, + "Total batch size for predictions.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. This is needed " + "because the start and end predictions are not conditioned on one another.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") + + +def validate_flags_or_throw(albert_config): + """Validate the input FLAGS or throw an exception.""" + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") + + if FLAGS.do_train: + if not FLAGS.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if FLAGS.do_predict: + if not FLAGS.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + + if FLAGS.max_seq_length > albert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the ALBERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, albert_config.max_position_embeddings)) + + if FLAGS.max_seq_length <= FLAGS.max_query_length + 3: + raise ValueError( + "The max_seq_length (%d) must be greater than max_query_length " + "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) + + validate_flags_or_throw(albert_config) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, + spm_model_file=FLAGS.spm_model_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 + if FLAGS.do_train: + iterations_per_loop = int(min(FLAGS.iterations_per_loop, + FLAGS.save_checkpoints_steps)) + else: + iterations_per_loop = FLAGS.iterations_per_loop + run_config = contrib_tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=contrib_tpu.TPUConfig( + iterations_per_loop=iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + train_examples = squad_utils.read_squad_examples( + input_file=FLAGS.train_file, is_training=True) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + if FLAGS.do_train: + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + # Pre-shuffle the input to avoid having to make a very large shuffle + # buffer in in the `input_fn`. + rng = random.Random(12345) + rng.shuffle(train_examples) + + model_fn = squad_utils.v1_model_fn_builder( + albert_config=albert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = contrib_tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + # We write to a temporary file to avoid storing very large constant tensors + # in memory. 
+ + if not tf.gfile.Exists(FLAGS.train_feature_file): + train_writer = squad_utils.FeatureWriter( + filename=os.path.join(FLAGS.train_feature_file), is_training=True) + squad_utils.convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=True, + output_fn=train_writer.process_feature, + do_lower_case=FLAGS.do_lower_case) + train_writer.close() + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num orig examples = %d", len(train_examples)) + # tf.logging.info(" Num split examples = %d", train_writer.num_features) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + del train_examples + + train_input_fn = squad_utils.input_fn_builder( + input_file=FLAGS.train_feature_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.train_batch_size, + is_v2=False) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_predict: + with tf.gfile.Open(FLAGS.predict_file) as predict_file: + prediction_json = json.load(predict_file)["data"] + + eval_examples = squad_utils.read_squad_examples( + input_file=FLAGS.predict_file, is_training=False) + + if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists( + FLAGS.predict_feature_left_file)): + tf.logging.info("Loading eval features from {}".format( + FLAGS.predict_feature_left_file)) + with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin: + eval_features = pickle.load(fin) + else: + eval_writer = squad_utils.FeatureWriter( + filename=FLAGS.predict_feature_file, is_training=False) + eval_features = [] + + def append_feature(feature): + eval_features.append(feature) + eval_writer.process_feature(feature) + + squad_utils.convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False, + output_fn=append_feature, + do_lower_case=FLAGS.do_lower_case) + eval_writer.close() + + with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout: + pickle.dump(eval_features, fout) + + tf.logging.info("***** Running predictions *****") + tf.logging.info(" Num orig examples = %d", len(eval_examples)) + tf.logging.info(" Num split examples = %d", len(eval_features)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_input_fn = squad_utils.input_fn_builder( + input_file=FLAGS.predict_feature_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.predict_batch_size, + is_v2=False) + + def get_result(checkpoint): + """Evaluate the checkpoint on SQuAD 1.0.""" + # If running eval on the TPU, you will need to specify the number of + # steps. 
+ reader = tf.train.NewCheckpointReader(checkpoint) + global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP) + all_results = [] + for result in estimator.predict( + predict_input_fn, yield_single_examples=True, + checkpoint_path=checkpoint): + if len(all_results) % 1000 == 0: + tf.logging.info("Processing example: %d" % (len(all_results))) + unique_id = int(result["unique_ids"]) + start_log_prob = [float(x) for x in result["start_log_prob"].flat] + end_log_prob = [float(x) for x in result["end_log_prob"].flat] + all_results.append( + squad_utils.RawResult( + unique_id=unique_id, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob)) + + output_prediction_file = os.path.join( + FLAGS.output_dir, "predictions.json") + output_nbest_file = os.path.join( + FLAGS.output_dir, "nbest_predictions.json") + + result_dict = {} + squad_utils.accumulate_predictions_v1( + result_dict, eval_examples, eval_features, + all_results, FLAGS.n_best_size, FLAGS.max_answer_length) + predictions = squad_utils.write_predictions_v1( + result_dict, eval_examples, eval_features, all_results, + FLAGS.n_best_size, FLAGS.max_answer_length, + output_prediction_file, output_nbest_file) + + return squad_utils.evaluate_v1( + prediction_json, predictions), int(global_step) + + def _find_valid_cands(curr_step): + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + candidates = [] + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + idx = ckpt_name.split("-")[-1] + if idx != "best" and int(idx) > curr_step: + candidates.append(filename) + return candidates + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + key_name = "f1" + writer = tf.gfile.GFile(output_eval_file, "w") + if tf.gfile.Exists(checkpoint_path + ".index"): + result = get_result(checkpoint_path) + best_perf = result[0][key_name] + global_step = result[1] + else: + global_step = -1 + best_perf = -1 + checkpoint_path = None + while global_step < num_train_steps: + steps_and_files = {} + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) + if cur_filename.split("-")[-1] == "best": + continue + gstep = int(cur_filename.split("-")[-1]) + if gstep not in steps_and_files: + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files[gstep] = cur_filename + tf.logging.info("found {} files.".format(len(steps_and_files))) + if not steps_and_files: + tf.logging.info("found 0 file, global step: {}. Sleeping." 
+ .format(global_step)) + time.sleep(60) + else: + for ele in sorted(steps_and_files.items()): + step, checkpoint_path = ele + if global_step >= step: + if len(_find_valid_cands(step)) > 1: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + continue + result, global_step = get_result(checkpoint_path) + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + if result[key_name] > best_perf: + best_perf = result[key_name] + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tgt_ckpt = checkpoint_path.rsplit( + "-", 1)[0] + "-best.{}".format(ext) + tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) + tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) + writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt)) + writer.write("best {} = {}\n".format(key_name, best_perf)) + tf.logging.info(" best {} = {}\n".format(key_name, best_perf)) + + if len(_find_valid_cands(global_step)) > 2: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + writer.write("=" * 50 + "\n") + + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + result, global_step = get_result(checkpoint_path) + tf.logging.info("***** Final Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("best perf happened at step: {}".format(global_step)) + + +if __name__ == "__main__": + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("albert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/run_squad_v2.py b/run_squad_v2.py new file mode 100644 index 00000000..27cfa130 --- /dev/null +++ b/run_squad_v2.py @@ -0,0 +1,500 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Lint as: python2, python3
+"""Run ALBERT on SQuAD v2.0 using sentence piece tokenization."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import json
+import os
+import random
+import time
+
+import modeling
+import squad_utils
+import tokenization
+import six
+import tensorflow as tf
+
+from tensorflow.contrib import cluster_resolver as contrib_cluster_resolver
+from tensorflow.contrib import tpu as contrib_tpu
+
+
+# pylint: disable=g-import-not-at-top
+if six.PY2:
+  import six.moves.cPickle as pickle
+else:
+  import pickle
+# pylint: enable=g-import-not-at-top
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+## Required parameters
+flags.DEFINE_string(
+    "albert_config_file", None,
+    "The config json file corresponding to the pre-trained ALBERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the ALBERT model was trained on.")
+
+flags.DEFINE_string("spm_model_file", None,
+                    "The model file for sentence piece tokenization.")
+
+flags.DEFINE_string(
+    "output_dir", None,
+    "The output directory where the model checkpoints will be written.")
+
+## Other parameters
+flags.DEFINE_string("train_file", None,
+                    "SQuAD json for training. E.g., train-v1.1.json")
+
+flags.DEFINE_string(
+    "predict_file", None,
+    "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+
+flags.DEFINE_string("train_feature_file", None,
+                    "Training feature file.")
+
+flags.DEFINE_string(
+    "predict_feature_file", None,
+    "Prediction feature file.")
+
+flags.DEFINE_string(
+    "predict_feature_left_file", None,
+    "Prediction data kept locally and not passed to the TPU.")
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained ALBERT model).")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer(
+    "max_seq_length", 384,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded.")
+
+flags.DEFINE_integer(
+    "doc_stride", 128,
+    "When splitting up a long document into chunks, how much stride to "
+    "take between chunks.")
+
+flags.DEFINE_integer(
+    "max_query_length", 64,
+    "The maximum number of tokens for the question. Questions longer than "
+    "this will be truncated to this length.")
+
+flags.DEFINE_bool("do_train", False, "Whether to run training.")
+
+flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
+
+flags.DEFINE_integer("predict_batch_size", 8,
+                     "Total batch size for predictions.")
+
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_float("num_train_epochs", 3.0,
+                   "Total number of training epochs to perform.")
+
+flags.DEFINE_float(
+    "warmup_proportion", 0.1,
+    "Proportion of training to perform linear learning rate warmup for. 
" + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. This is needed " + "because the start and end predictions are not conditioned on one another.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +flags.DEFINE_integer("start_n_top", 5, "beam size for the start positions.") + +flags.DEFINE_integer("end_n_top", 5, "beam size for the end positions.") + +flags.DEFINE_float("dropout_prob", 0.1, "dropout probability.") + + +def validate_flags_or_throw(albert_config): + """Validate the input FLAGS or throw an exception.""" + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") + + if FLAGS.do_train: + if not FLAGS.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if FLAGS.do_predict: + if not FLAGS.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + + if FLAGS.max_seq_length > albert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the ALBERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, albert_config.max_position_embeddings)) + + if FLAGS.max_seq_length <= FLAGS.max_query_length + 3: + raise ValueError( + "The max_seq_length (%d) must be greater than max_query_length " + "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file) + + validate_flags_or_throw(albert_config) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case, + spm_model_file=FLAGS.spm_model_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2 + if FLAGS.do_train: + iterations_per_loop = int(min(FLAGS.iterations_per_loop, + FLAGS.save_checkpoints_steps)) + 
else:
+    iterations_per_loop = FLAGS.iterations_per_loop
+  run_config = contrib_tpu.RunConfig(
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=contrib_tpu.TPUConfig(
+          iterations_per_loop=iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  train_examples = None
+  num_train_steps = None
+  num_warmup_steps = None
+  train_examples = squad_utils.read_squad_examples(
+      input_file=FLAGS.train_file, is_training=True)
+  num_train_steps = int(
+      len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+  if FLAGS.do_train:
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+    # Pre-shuffle the input to avoid having to make a very large shuffle
+    # buffer in the `input_fn`.
+    rng = random.Random(12345)
+    rng.shuffle(train_examples)
+
+  model_fn = squad_utils.v2_model_fn_builder(
+      albert_config=albert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=num_train_steps,
+      num_warmup_steps=num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_tpu,
+      max_seq_length=FLAGS.max_seq_length,
+      start_n_top=FLAGS.start_n_top,
+      end_n_top=FLAGS.end_n_top,
+      dropout_prob=FLAGS.dropout_prob)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  estimator = contrib_tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      train_batch_size=FLAGS.train_batch_size,
+      predict_batch_size=FLAGS.predict_batch_size)
+
+  if FLAGS.do_train:
+    # We write to a temporary file to avoid storing very large constant tensors
+    # in memory.
+
+    if not tf.gfile.Exists(FLAGS.train_feature_file):
+      train_writer = squad_utils.FeatureWriter(
+          filename=os.path.join(FLAGS.train_feature_file), is_training=True)
+      squad_utils.convert_examples_to_features(
+          examples=train_examples,
+          tokenizer=tokenizer,
+          max_seq_length=FLAGS.max_seq_length,
+          doc_stride=FLAGS.doc_stride,
+          max_query_length=FLAGS.max_query_length,
+          is_training=True,
+          output_fn=train_writer.process_feature,
+          do_lower_case=FLAGS.do_lower_case)
+      train_writer.close()
+
+    tf.logging.info("***** Running training *****")
+    tf.logging.info(" Num orig examples = %d", len(train_examples))
+    # tf.logging.info(" Num split examples = %d", train_writer.num_features)
+    tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
+    tf.logging.info(" Num steps = %d", num_train_steps)
+    del train_examples
+
+    train_input_fn = squad_utils.input_fn_builder(
+        input_file=FLAGS.train_feature_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=True,
+        drop_remainder=True,
+        use_tpu=FLAGS.use_tpu,
+        bsz=FLAGS.train_batch_size,
+        is_v2=True)
+    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+
+  if FLAGS.do_predict:
+    with tf.gfile.Open(FLAGS.predict_file) as predict_file:
+      prediction_json = json.load(predict_file)["data"]
+    eval_examples = squad_utils.read_squad_examples(
+        input_file=FLAGS.predict_file, is_training=False)
+
+    if (tf.gfile.Exists(FLAGS.predict_feature_file) and tf.gfile.Exists(
+        FLAGS.predict_feature_left_file)):
+      tf.logging.info("Loading eval features from {}".format(
+          FLAGS.predict_feature_left_file))
+      with tf.gfile.Open(FLAGS.predict_feature_left_file, "rb") as fin:
+        eval_features = pickle.load(fin)
+    else:
+      eval_writer = squad_utils.FeatureWriter(
+          filename=FLAGS.predict_feature_file, is_training=False)
+
eval_features = [] + + def append_feature(feature): + eval_features.append(feature) + eval_writer.process_feature(feature) + + squad_utils.convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False, + output_fn=append_feature, + do_lower_case=FLAGS.do_lower_case) + eval_writer.close() + + with tf.gfile.Open(FLAGS.predict_feature_left_file, "wb") as fout: + pickle.dump(eval_features, fout) + + tf.logging.info("***** Running predictions *****") + tf.logging.info(" Num orig examples = %d", len(eval_examples)) + tf.logging.info(" Num split examples = %d", len(eval_features)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_input_fn = squad_utils.input_fn_builder( + input_file=FLAGS.predict_feature_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False, + use_tpu=FLAGS.use_tpu, + bsz=FLAGS.predict_batch_size, + is_v2=True) + + def get_result(checkpoint): + """Evaluate the checkpoint on SQuAD v2.0.""" + # If running eval on the TPU, you will need to specify the number of + # steps. + reader = tf.train.NewCheckpointReader(checkpoint) + global_step = reader.get_tensor(tf.GraphKeys.GLOBAL_STEP) + all_results = [] + for result in estimator.predict( + predict_input_fn, yield_single_examples=True, + checkpoint_path=checkpoint): + if len(all_results) % 1000 == 0: + tf.logging.info("Processing example: %d" % (len(all_results))) + unique_id = int(result["unique_ids"]) + start_top_log_probs = ( + [float(x) for x in result["start_top_log_probs"].flat]) + start_top_index = [int(x) for x in result["start_top_index"].flat] + end_top_log_probs = ( + [float(x) for x in result["end_top_log_probs"].flat]) + end_top_index = [int(x) for x in result["end_top_index"].flat] + + cls_logits = float(result["cls_logits"].flat[0]) + all_results.append( + squad_utils.RawResultV2( + unique_id=unique_id, + start_top_log_probs=start_top_log_probs, + start_top_index=start_top_index, + end_top_log_probs=end_top_log_probs, + end_top_index=end_top_index, + cls_logits=cls_logits)) + + output_prediction_file = os.path.join( + FLAGS.output_dir, "predictions.json") + output_nbest_file = os.path.join( + FLAGS.output_dir, "nbest_predictions.json") + output_null_log_odds_file = os.path.join( + FLAGS.output_dir, "null_odds.json") + + result_dict = {} + cls_dict = {} + squad_utils.accumulate_predictions_v2( + result_dict, cls_dict, eval_examples, eval_features, + all_results, FLAGS.n_best_size, FLAGS.max_answer_length, + FLAGS.start_n_top, FLAGS.end_n_top) + + return squad_utils.evaluate_v2( + result_dict, cls_dict, prediction_json, eval_examples, + eval_features, all_results, FLAGS.n_best_size, + FLAGS.max_answer_length, output_prediction_file, output_nbest_file, + output_null_log_odds_file), int(global_step) + + def _find_valid_cands(curr_step): + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + candidates = [] + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + idx = ckpt_name.split("-")[-1] + if idx != "best" and int(idx) > curr_step: + candidates.append(filename) + return candidates + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + key_name = "f1" + writer = tf.gfile.GFile(output_eval_file, "w") + if tf.gfile.Exists(checkpoint_path + ".index"): + result = get_result(checkpoint_path) + 
best_perf = result[0][key_name] + global_step = result[1] + else: + global_step = -1 + best_perf = -1 + checkpoint_path = None + while global_step < num_train_steps: + steps_and_files = {} + filenames = tf.gfile.ListDirectory(FLAGS.output_dir) + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = os.path.join(FLAGS.output_dir, ckpt_name) + if cur_filename.split("-")[-1] == "best": + continue + gstep = int(cur_filename.split("-")[-1]) + if gstep not in steps_and_files: + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files[gstep] = cur_filename + tf.logging.info("found {} files.".format(len(steps_and_files))) + if not steps_and_files: + tf.logging.info("found 0 file, global step: {}. Sleeping." + .format(global_step)) + time.sleep(60) + else: + for ele in sorted(steps_and_files.items()): + step, checkpoint_path = ele + if global_step >= step: + if len(_find_valid_cands(step)) > 1: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + continue + result, global_step = get_result(checkpoint_path) + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + if result[key_name] > best_perf: + best_perf = result[key_name] + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tgt_ckpt = checkpoint_path.rsplit( + "-", 1)[0] + "-best.{}".format(ext) + tf.logging.info("saving {} to {}".format(src_ckpt, tgt_ckpt)) + tf.gfile.Copy(src_ckpt, tgt_ckpt, overwrite=True) + writer.write("saved {} to {}\n".format(src_ckpt, tgt_ckpt)) + writer.write("best {} = {}\n".format(key_name, best_perf)) + tf.logging.info(" best {} = {}\n".format(key_name, best_perf)) + + if len(_find_valid_cands(global_step)) > 2: + for ext in ["meta", "data-00000-of-00001", "index"]: + src_ckpt = checkpoint_path + ".{}".format(ext) + tf.logging.info("removing {}".format(src_ckpt)) + tf.gfile.Remove(src_ckpt) + writer.write("=" * 50 + "\n") + + checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best") + result, global_step = get_result(checkpoint_path) + tf.logging.info("***** Final Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + writer.write("best perf happened at step: {}".format(global_step)) + + +if __name__ == "__main__": + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("albert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/squad_utils.py b/squad_utils.py index 6f4ace84..10469b39 100644 --- a/squad_utils.py +++ b/squad_utils.py @@ -247,9 +247,10 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, tok_start_to_chartok_index = [] tok_end_to_chartok_index = [] char_cnt = 0 + para_tokens = [six.ensure_text(token, "utf-8") for token in para_tokens] for i, token in enumerate(para_tokens): - new_token = six.ensure_binary(token).replace( - tokenization.SPIECE_UNDERLINE, b" ") + new_token = six.ensure_text(token).replace( + tokenization.SPIECE_UNDERLINE.decode("utf-8"), " ") chartok_to_tok_index.extend([i] * len(new_token)) tok_start_to_chartok_index.append(char_cnt) char_cnt += len(new_token) @@ -836,7 +837,6 @@ def compute_loss(logits, 
positions): else: raise ValueError( "Only TRAIN and PREDICT modes are supported: %s" % (mode)) - return output_spec return model_fn @@ -1666,14 +1666,8 @@ def compute_loss(log_probs, positions): "end_top_log_probs": outputs["end_top_log_probs"], "cls_logits": outputs["cls_logits"] } - - if use_tpu: - output_spec = contrib_tpu.TPUEstimatorSpec( - mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) - else: - output_spec = tf.estimator.EstimatorSpec( - mode=mode, predictions=predictions) - return output_spec + output_spec = contrib_tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) else: raise ValueError( "Only TRAIN and PREDICT modes are supported: %s" % (mode))
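
Note on the first squad_utils.py hunk above: it converts para_tokens to text and switches the SPIECE_UNDERLINE replacement from a bytes-level to a text-level operation. The sketch below is not part of the patch; it only illustrates the difference this makes for the character-offset bookkeeping (char_cnt, chartok_to_tok_index) that follows, assuming SPIECE_UNDERLINE in tokenization.py is the UTF-8 encoding of "▁". The example tokens are invented.

# Illustrative sketch only -- not part of the patch. Shows why len() over
# decoded text differs from len() over UTF-8 bytes when building char offsets.
import six

SPIECE_UNDERLINE = u"▁".encode("utf-8")  # assumed value from tokenization.py

para_tokens = [u"▁Déjà", u"▁vu"]  # invented example tokens

# Old behaviour: byte-level replace; len() counts UTF-8 bytes, so non-ASCII
# characters inflate the running offset fed into chartok_to_tok_index.
byte_lens = [len(six.ensure_binary(t).replace(SPIECE_UNDERLINE, b" "))
             for t in para_tokens]

# New behaviour: text-level replace; len() counts unicode characters.
text_lens = [len(six.ensure_text(t).replace(SPIECE_UNDERLINE.decode("utf-8"), u" "))
             for t in para_tokens]

print(byte_lens)  # [7, 3] -- byte counts for "▁Déjà", "▁vu"
print(text_lens)  # [5, 3] -- character counts

Either way the replaced token text is the same; only the lengths used to map character positions back to tokens change, which appears to be the motivation for the switch.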