tree-wide: minor typo fixes
stefan-it committed Oct 31, 2018
1 parent d45d0fb commit d8014ef
Showing 5 changed files with 11 additions and 11 deletions.
2 changes: 1 addition & 1 deletion create_pretraining_data.py
@@ -334,7 +334,7 @@ def create_instances_from_document(

def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictis for the masked LM objective."""
"""Creates the predictions for the masked LM objective."""

cand_indexes = []
for (i, token) in enumerate(tokens):
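The docstring fixed above belongs to the routine that chooses which tokens to mask for the masked-LM objective. As a simplified illustration of that recipe (the 80/10/10 masking scheme described in the BERT paper, not the repository's exact implementation), a sketch with the same signature might look like this; `tokens` is a list of wordpiece strings and `rng` a `random.Random` instance:

import collections

MaskedLmInstance = collections.namedtuple("MaskedLmInstance", ["index", "label"])

def simple_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
  """Simplified sketch: pick tokens to mask for the masked-LM objective."""
  # Special tokens are never masked.
  cand_indexes = [i for (i, token) in enumerate(tokens)
                  if token not in ("[CLS]", "[SEP]")]
  rng.shuffle(cand_indexes)
  num_to_predict = min(max_predictions_per_seq,
                       max(1, int(round(len(tokens) * masked_lm_prob))))

  output_tokens = list(tokens)
  masked_lms = []
  for index in cand_indexes[:num_to_predict]:
    if rng.random() < 0.8:      # 80%: replace with [MASK]
      masked_token = "[MASK]"
    elif rng.random() < 0.5:    # 10%: keep the original token
      masked_token = tokens[index]
    else:                       # 10%: replace with a random vocab word
      masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
    output_tokens[index] = masked_token
    masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
  masked_lms.sort(key=lambda x: x.index)
  return output_tokens, masked_lms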
4 changes: 2 additions & 2 deletions extract_features.py
@@ -57,7 +57,7 @@

flags.DEFINE_bool(
"do_lower_case", True,
"Whethre to lower case the input text. Should be True for uncased "
"Whether to lower case the input text. Should be True for uncased "
"models and False for cased models.")

flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
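The `do_lower_case` flag documented above must match the checkpoint in use. As a hedged sketch of how the flag is typically consumed (the vocab path is a placeholder), it is passed straight to the repository's `FullTokenizer`:

import tokenization  # tokenization.py from this repository

# Uncased checkpoints expect do_lower_case=True, cased ones expect False.
do_lower_case = True  # i.e. the value parsed into FLAGS.do_lower_case
tokenizer = tokenization.FullTokenizer(
    vocab_file="/path/to/uncased_vocab.txt",  # placeholder path
    do_lower_case=do_lower_case)
print(tokenizer.tokenize("Feature extraction example sentence"))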
@@ -231,7 +231,7 @@ def convert_examples_to_features(examples, seq_length, tokenizer):
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
- # since the [SEP] token unambigiously separates the sequences, but it makes
+ # since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
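The comment corrected above describes the `type_ids` (segment id) convention for sentence pairs. A minimal sketch of that layout, with illustrative tokens rather than code from this file:

# Hypothetical pair "how are you" / "i am fine", already wordpiece-tokenized.
tokens   = ["[CLS]", "how", "are", "you", "[SEP]", "i", "am", "fine", "[SEP]"]
type_ids = [0,       0,     0,     0,     0,       1,   1,    1,      1]
# A single sequence would use type 0 everywhere; padding positions also get
# type 0, with input_mask set to 0 so they are ignored by attention.
assert len(tokens) == len(type_ids)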
10 changes: 5 additions & 5 deletions modeling.py
@@ -54,7 +54,7 @@ def __init__(self,
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler.
- hidden_dropout_prob: The dropout probabilitiy for all fully connected
+ hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
@@ -63,7 +63,7 @@ def __init__(self,
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
- initializer_range: The sttdev of the truncated_normal_initializer for
+ initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
"""
self.vocab_size = vocab_size
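The docstring entries corrected above all belong to `BertConfig`. As a hedged sketch (values are illustrative and roughly BERT-Base-like; a real checkpoint's bert_config.json is the authoritative source), the config is constructed by passing these arguments directly:

import modeling  # modeling.py from this repository

config = modeling.BertConfig(
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,           # dropout for fully connected layers
    attention_probs_dropout_prob=0.1,  # dropout on attention probabilities
    max_position_embeddings=512,       # longest supported sequence
    type_vocab_size=2,                 # two segment types: 0 and 1
    initializer_range=0.02)            # stddev of the truncated normal init

In practice the config is usually read with `modeling.BertConfig.from_json_file(bert_config_file)` rather than spelled out by hand.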
@@ -347,7 +347,7 @@ def dropout(input_tensor, dropout_prob):
Args:
input_tensor: float Tensor.
- dropout_prob: Python float. The probabiltiy of dropping out a value (NOT of
+ dropout_prob: Python float. The probability of dropping out a value (NOT of
*keeping* a dimension as in `tf.nn.dropout`).
Returns:
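The fix above is in the docstring of the repository's `dropout` helper: its `dropout_prob` is the probability of dropping a value, while TF 1.x's `tf.nn.dropout` takes a keep probability. A minimal sketch of a wrapper with that convention (an assumption about the helper's shape, not a quote of modeling.py):

import tensorflow as tf  # TF 1.x, as used by this repository

def dropout(input_tensor, dropout_prob):
  """dropout_prob is the probability of *dropping* a value."""
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor
  # TF 1.x's tf.nn.dropout expects a keep probability, hence 1.0 - dropout_prob.
  return tf.nn.dropout(input_tensor, 1.0 - dropout_prob)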
@@ -605,7 +605,7 @@ def attention_layer(from_tensor,
attention_mask: (optional) int32 Tensor of shape [batch_size,
from_seq_length, to_seq_length]. The values should be 1 or 0. The
attention scores will effectively be set to -infinity for any positions in
- the mask that are 0, and will be unchaged for positions that are 1.
+ the mask that are 0, and will be unchanged for positions that are 1.
num_attention_heads: int. Number of attention heads.
size_per_head: int. Size of each attention head.
query_act: (optional) Activation function for the query transform.
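The `attention_mask` description above says positions marked 0 are pushed to effectively -infinity before the softmax. One common way to realize this, sketched here under the assumption that a large negative constant stands in for -infinity, is to turn the 0/1 mask into an additive bias:

import tensorflow as tf  # TF 1.x

def apply_attention_mask(attention_scores, attention_mask):
  """attention_scores: float [batch, heads, from_seq, to_seq].
  attention_mask: int32 [batch, from_seq, to_seq], 1 = attend, 0 = block."""
  # 1 -> 0.0 (unchanged), 0 -> large negative bias, so softmax weight ~ 0.
  adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
  # Broadcast the bias over the head dimension and add it before the softmax.
  return attention_scores + tf.expand_dims(adder, axis=1)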
@@ -973,7 +973,7 @@ def assert_rank(tensor, expected_rank, name=None):
name: Optional name of the tensor for the error message.
Raises:
ValueError: If the expected shape doesn"t match the actual shape.
ValueError: If the expected shape doesn't match the actual shape.
"""
if name is None:
name = tensor.name
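`assert_rank`, whose docstring is touched above, guards shape assumptions throughout modeling.py. A simplified standalone version of such a check (an illustration, not the file's exact code):

import tensorflow as tf  # TF 1.x

def simple_assert_rank(tensor, expected_rank, name=None):
  """Raises ValueError unless `tensor` has one of the expected ranks."""
  if name is None:
    name = tensor.name
  expected = ({expected_rank} if isinstance(expected_rank, int)
              else set(expected_rank))
  actual_rank = tensor.shape.ndims
  if actual_rank not in expected:
    raise ValueError("Tensor %s has rank %s, expected one of %s" %
                     (name, actual_rank, sorted(expected)))

# Typical use: input_ids is expected to be [batch_size, seq_length].
# simple_assert_rank(input_ids, expected_rank=2)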
2 changes: 1 addition & 1 deletion run_classifier.py
@@ -306,7 +306,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
- # since the [SEP] token unambigiously separates the sequences, but it makes
+ # since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
4 changes: 2 additions & 2 deletions run_pretraining.py
@@ -375,8 +375,8 @@ def input_fn(params):
d = d.repeat()

# We must `drop_remainder` on training because the TPU requires fixed
- # size dimensions. For eval, we assume we are evaling on the CPU or GPU
- # and we *don"t* want to drop the remainder, otherwise we wont cover
+ # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
+ # and we *don't* want to drop the remainder, otherwise we wont cover
# every sample.
d = d.apply(
tf.contrib.data.map_and_batch(
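The comment fixed above explains why training batches use `drop_remainder` (TPUs need statically shaped batches) while eval keeps the last partial batch so every sample is covered. A hedged sketch of the same pattern with the plain `tf.data` batch API (the repository itself goes through `tf.contrib.data.map_and_batch`; buffer and batch sizes here are placeholders):

import tensorflow as tf  # TF 1.x

def batch_dataset(d, batch_size, is_training):
  """d: a tf.data.Dataset of already-parsed examples."""
  if is_training:
    d = d.repeat()
    d = d.shuffle(buffer_size=100)
    # TPU training needs fixed-size batches: drop the short final batch.
    return d.batch(batch_size, drop_remainder=True)
  # CPU/GPU eval: keep the remainder so every example is evaluated exactly once.
  return d.batch(batch_size, drop_remainder=False)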
