Skip to content

Commit

Permalink
Merge commit for internal changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Jonathan Hseu committed Mar 14, 2017
2 parents 3549499 + f861a1e commit be6c40a
Show file tree
Hide file tree
Showing 202 changed files with 6,942 additions and 2,082 deletions.
6 changes: 4 additions & 2 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ function bazel_clean_and_fetch() {
if ! is_windows; then
bazel clean --expunge
fi
bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... \
-//tensorflow/examples/android/..."
if [ -z "$TF_BAZEL_TARGETS" ]; then
TF_BAZEL_TARGETS="//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..."
fi
bazel fetch "$TF_BAZEL_TARGETS"
}

function sed_hyphen_i() {
Expand Down
3 changes: 2 additions & 1 deletion tensorflow/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ filegroup(
"//tensorflow/contrib:all_files",
"//tensorflow/contrib/android:all_files",
"//tensorflow/contrib/bayesflow:all_files",
"//tensorflow/contrib/cloud:all_files",
"//tensorflow/contrib/cloud/kernels:all_files",
"//tensorflow/contrib/compiler:all_files",
"//tensorflow/contrib/copy_graph:all_files",
"//tensorflow/contrib/crf:all_files",
Expand Down Expand Up @@ -228,7 +230,6 @@ filegroup(
"//tensorflow/core/grappler/inputs:all_files",
"//tensorflow/core/grappler/optimizers:all_files",
"//tensorflow/core/kernels:all_files",
"//tensorflow/core/kernels/cloud:all_files",
"//tensorflow/core/kernels/hexagon:all_files",
"//tensorflow/core/ops/compat:all_files",
"//tensorflow/core/platform/cloud:all_files",
Expand Down
10 changes: 10 additions & 0 deletions tensorflow/cc/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,16 @@ tf_gen_op_wrappers_cc(
visibility = ["//tensorflow:internal"],
)

# Generates the C++ op-wrapper library for the "resource_variable_ops" op lib
# from //tensorflow/core. include_internal_ops = 1 also emits wrappers for
# internal ops; the target is visible only within TensorFlow itself.
tf_gen_op_wrappers_cc(
    name = "resource_variable_ops",
    include_internal_ops = 1,
    op_lib_names = [
        "resource_variable_ops",
    ],
    pkg = "//tensorflow/core",
    visibility = ["//tensorflow:internal"],
)

tf_gen_op_wrappers_cc(
name = "remote_fused_graph_ops",
op_lib_names = [
Expand Down
18 changes: 16 additions & 2 deletions tensorflow/compiler/tests/xla_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,11 @@ def test_scope(self):
yield


def Benchmark(tf_bench, builder_fn, use_xla_jit, device):
def Benchmark(tf_bench,
builder_fn,
use_xla_jit,
device,
separate_compiled_gradients=False):
"""Build a graph and run benchmarks against it, with or without XLA.
Args:
Expand All @@ -129,6 +133,14 @@ def Benchmark(tf_bench, builder_fn, use_xla_jit, device):
is a list of tensors to fetch as output.
use_xla_jit: If true compile with the XLA JIT, otherwise use regular TF.
device: The tensorflow device to run on, e.g. "cpu", "gpu".
separate_compiled_gradients: If true put each gradient subgraph into a
separate compilation scope. This gives fine-grained control over which
portions of the graph will be compiled as a single unit. Compiling
gradients separately may yield better performance for some graphs.
The scope is named based on the scope of the forward computation as well
as the name of the gradients. As a result, the gradients will be compiled
in a scope that is separate from both the forward computation, and from
other gradients.
"""

with ops.Graph().as_default():
Expand All @@ -137,7 +149,9 @@ def Benchmark(tf_bench, builder_fn, use_xla_jit, device):
with ops.device(device):
fetches = []
jit_scope = jit.experimental_jit_scope
with jit_scope(compile_ops=use_xla_jit):
with jit_scope(
compile_ops=use_xla_jit,
separate_compiled_gradients=separate_compiled_gradients):
name, fetches = builder_fn()

# We only want to benchmark the operations themselves, and not the data
Expand Down
10 changes: 9 additions & 1 deletion tensorflow/compiler/tf2xla/dump_graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,16 @@ struct NameCounts {
std::unordered_map<string, int> counts;
};

string MakeUniquePath(const string& name) {
string MakeUniquePath(string name) {
static NameCounts& instance = *new NameCounts;

// Remove illegal characters from `name`.
for (int i = 0; i < name.size(); ++i) {
if (name[i] == '/') {
name[i] = '_';
}
}

int count;
{
mutex_lock lock(instance.counts_mutex);
Expand Down
5 changes: 2 additions & 3 deletions tensorflow/compiler/tf2xla/kernels/declaration_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,8 @@ class ArgOp : public XlaOpKernel {
if (arg.is_variable) {
// We use the argument position of the variable input as a unique ID.
// TODO(phawkins): this code assumes that variables do not alias.
// TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
tc.CreateVariable(index_, arg.name, arg.value.type, arg.value.handle)
.IgnoreError();
OP_REQUIRES_OK(ctx, tc.CreateVariable(index_, arg.name, arg.value.type,
arg.value.handle));
ctx->SetVariableOutput(0, index_);
} else if (arg.value.is_constant) {
ctx->SetConstantOutput(0, arg.value.constant_value);
Expand Down
4 changes: 2 additions & 2 deletions tensorflow/compiler/tf2xla/xla_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ int64 XlaCompiler::NextStepId() {
static void PruneUnreachableNodes(Graph* graph) {
std::unordered_set<const Node*> nodes;
for (Node* node : graph->nodes()) {
if (node->type_string() == "_Retval") {
if (node->type_string() == "_Retval" ||
StringPiece(node->type_string()).ends_with("Send")) {
nodes.insert(node);
}
}
Expand Down Expand Up @@ -379,7 +380,6 @@ Status XlaCompiler::CompileGraph(string const& name,
VLOG(1) << "Executing graph symbolically to populate ComputationBuilder.";

xla::ComputationBuilder builder(client(), name);

XlaContext* context =
new XlaContext(this, &builder, options_.allow_cpu_custom_calls,
options_.resolve_compile_time_constants);
Expand Down
1 change: 1 addition & 0 deletions tensorflow/compiler/xla/service/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ cc_test(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:test_helpers",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/tests:hlo_test_base",
"//tensorflow/core:test_main",
],
)
Expand Down
3 changes: 2 additions & 1 deletion tensorflow/compiler/xla/service/buffer_assignment.cc
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,8 @@ Status GatherComputationsByAllocationType(
}

for (auto& instruction : computation->instructions()) {
for (auto* subcomputation : instruction->MakeCalledComputationsSet()) {
for (HloComputation* subcomputation :
instruction->called_computations()) {
switch (instruction->opcode()) {
case HloOpcode::kCall:
case HloOpcode::kWhile:
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/compiler/xla/service/buffer_liveness.cc
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a,
// *) Is a loop fusion instruction (with DynamicUpdateSlice fused root) where
// the singleton use of 'a' at 'a.index' is the fused root at operand 0.
for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) {
if (alias.instruction()->users().count(b.instruction()) > 0 &&
if (b.instruction()->IsUserOf(alias.instruction()) &&
!CanShareOperandBufferWithUser(alias.instruction(), alias.index(),
b.instruction(), b.index(),
points_to_analysis())) {
Expand Down
19 changes: 8 additions & 11 deletions tensorflow/compiler/xla/service/copy_insertion_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,14 @@ class CopyInsertionTest : public HloTestBase {
};
};

#define EXPECT_INST(A, E...) EXPECT_EQ(A, (std::set<HloInstruction*>{E}))

TEST_F(CopyInsertionTest, SingleParameter) {
auto builder = HloComputation::Builder(TestName());
HloInstruction* x = builder.AddInstruction(
HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "x"));
HloInstruction* tuple =
builder.AddInstruction(HloInstruction::CreateTuple({x}));

EXPECT_INST(x->users(), tuple);
ExpectEqUnordered(x->users(), {tuple});

HloModule module(TestName());
module.AddEntryComputation(builder.Build());
Expand Down Expand Up @@ -127,7 +125,7 @@ TEST_F(CopyInsertionTest, SingleConstant) {
HloInstruction* tuple =
builder.AddInstruction(HloInstruction::CreateTuple({constant}));

EXPECT_INST(constant->users(), tuple);
ExpectEqUnordered(constant->users(), {tuple});

HloModule module(TestName());
module.AddEntryComputation(builder.Build());
Expand Down Expand Up @@ -221,9 +219,9 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
builder.AddInstruction(HloInstruction::CreateTernary(
tuple1->shape(), HloOpcode::kSelect, pred, tuple1, tuple2));

EXPECT_INST(constant1->users(), tuple1);
EXPECT_INST(constant2->users(), tuple1, tuple2);
EXPECT_INST(constant3->users(), tuple2);
ExpectEqUnordered(constant1->users(), {tuple1});
ExpectEqUnordered(constant2->users(), {tuple1, tuple2});
ExpectEqUnordered(constant3->users(), {tuple2});

HloModule module(TestName());
module.AddEntryComputation(builder.Build());
Expand Down Expand Up @@ -261,7 +259,7 @@ TEST_F(CopyInsertionTest, BitcastParameter) {
HloModule module(TestName());
module.AddEntryComputation(builder.Build());

EXPECT_INST(x->users(), bitcast);
ExpectEqUnordered(x->users(), {bitcast});

HloInstruction* old_root = module.entry_computation()->root_instruction();
InsertCopies(&module);
Expand Down Expand Up @@ -289,7 +287,7 @@ TEST_F(CopyInsertionTest, BitcastConstant) {
HloModule module(TestName());
module.AddEntryComputation(builder.Build());

EXPECT_INST(constant->users(), bitcast);
ExpectEqUnordered(constant->users(), {bitcast});

HloInstruction* old_root = module.entry_computation()->root_instruction();
InsertCopies(&module);
Expand All @@ -316,8 +314,7 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) {
HloModule module(TestName());
module.AddEntryComputation(builder.Build());

EXPECT_EQ(1, x->user_count());
EXPECT_EQ(*x->users().begin(), bitcast);
ExpectEqUnordered(x->users(), {bitcast});

HloInstruction* old_root = module.entry_computation()->root_instruction();
InsertCopies(&module);
Expand Down
3 changes: 2 additions & 1 deletion tensorflow/compiler/xla/service/gpu/fusion_merger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/gpu/fusion_merger.h"

#include <algorithm>
#include <vector>

#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
#include "tensorflow/compiler/xla/service/instruction_fusion.h"
Expand Down Expand Up @@ -249,7 +250,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
return Status::OK();
}
// Merge fused instructions from 'fusion' into each user.
std::set<HloInstruction*> users = fusion->users();
std::vector<HloInstruction*> users = fusion->users();
for (HloInstruction* user : users) {
user->MergeFusionInstruction(fusion);
changed_ = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ limitations under the License.
#include "external/llvm/include/llvm/Transforms/IPO/AlwaysInliner.h"
#include "external/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h"

#include "external/llvm/include/llvm/Transforms/IPO/Internalize.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/lib/io/path.h"
Expand Down Expand Up @@ -319,9 +320,13 @@ tensorflow::Status LinkLibdeviceIfNecessary(
VLOG(1) << "Linking with libdevice from: " << libdevice_path;
std::unique_ptr<llvm::Module> libdevice_module =
LoadIRModule(libdevice_path, &module->getContext());
if (linker.linkInModule(std::move(libdevice_module),
llvm::Linker::Flags::InternalizeLinkedSymbols |
llvm::Linker::Flags::LinkOnlyNeeded)) {
if (linker.linkInModule(
std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded,
[](Module& M, const StringSet<>& GVS) {
internalizeModule(M, [&M, &GVS](const GlobalValue& GV) {
return !GV.hasName() || (GVS.count(GV.getName()) == 0);
});
})) {
return tensorflow::errors::Internal(tensorflow::strings::StrCat(
"Error linking libdevice from ", libdevice_path));
}
Expand Down
3 changes: 2 additions & 1 deletion tensorflow/compiler/xla/service/heap_simulator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
}
for (const BufferAlias& alias :
points_to_analysis.GetBufferAliases(*buffer)) {
const std::set<HloInstruction*>& users = alias.instruction()->users();
const std::vector<HloInstruction*>& users =
alias.instruction()->users();
if (!users.empty()) {
live_buffers[buffer].insert(users.begin(), users.end());
}
Expand Down
18 changes: 8 additions & 10 deletions tensorflow/compiler/xla/service/hlo_computation.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,12 @@ Status HloComputation::RemoveInstruction(HloInstruction* instruction) {
TF_RET_CHECK(instruction->user_count() == 0)
<< "instruction " << instruction->name()
<< " has users and cannot be removed";
TF_RET_CHECK(instruction->control_predecessors().empty())
<< "instruction " << instruction->name()
<< " has control predecessors and cannot be removed";
TF_RET_CHECK(instruction->control_successors().empty())
<< "instruction " << instruction->name()
<< " has control successors and cannot be removed";

TF_RET_CHECK(instruction_iterators_.count(instruction) != 0);
auto inst_it = instruction_iterators_.at(instruction);
Expand Down Expand Up @@ -227,7 +233,8 @@ void ComputeComputationPostOrder(
}

for (auto& instruction : computation->instructions()) {
for (auto& called_computation : instruction->MakeCalledComputationsSet()) {
for (HloComputation* called_computation :
instruction->called_computations()) {
ComputeComputationPostOrder(called_computation, visited, post_order);
}
}
Expand Down Expand Up @@ -383,15 +390,6 @@ StatusOr<HloInstruction*> HloComputation::DeepCopyInstruction(
}
}

// Adds a control dependency so that `predecessor` is visited before
// `successor` during DFS traversal of this computation — an ordering
// constraint not captured by normal data dependencies (e.g. ordering among
// Send/Recv operations to avoid deadlock). Returns an error status if either
// instruction does not belong to this computation.
Status HloComputation::AddControlDependency(HloInstruction* predecessor,
                                            HloInstruction* successor) {
  // Both endpoints must be instructions owned by this computation.
  TF_RET_CHECK(instruction_iterators_.count(predecessor) > 0);
  TF_RET_CHECK(instruction_iterators_.count(successor) > 0);
  // Record the edge on both endpoints so it can be walked in either direction.
  successor->AddControlPredecessor(predecessor);
  predecessor->AddControlSuccessor(successor);
  return Status::OK();
}

ProgramShape HloComputation::ComputeProgramShape() const {
ProgramShape program_shape;

Expand Down
11 changes: 0 additions & 11 deletions tensorflow/compiler/xla/service/hlo_computation.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,6 @@ class HloComputation {
return instructions_;
}

// Add a control dependency between the two instructions in this computation
// so that the 'predecessor' is visited before the 'successor' during the DFS
// traversal of the computation. Returns an error status if either of the
// given instructions does not belong to the current computation.
//
// This is used to enforce an additional ordering requirement that is not
// captured by normal data dependencies, such as ordering among Send or Recv
// operations to avoid deadlock.
Status AddControlDependency(HloInstruction* predecessor,
HloInstruction* successor);

// Compute and return a post-order of the instructions in the computation. In
// this order, definitions of values always appear before their uses.
std::list<HloInstruction*> MakeInstructionPostOrder() const;
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/compiler/xla/service/hlo_computation_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ TEST_F(HloComputationTest, CycleDetection) {
auto computation = builder.Build();

// Add a control dependency to create a cycle.
ASSERT_IS_OK(computation->AddControlDependency(add, negate));
ASSERT_IS_OK(add->AddControlDependencyTo(negate));

const auto visitor = [](HloInstruction* instruction) { return Status::OK(); };
auto visit_status = computation->Accept(visitor);
Expand Down
5 changes: 2 additions & 3 deletions tensorflow/compiler/xla/service/hlo_graph_dumper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,6 @@ string InstructionSequenceGraph(
instruction->metadata().source_line());
}

std::vector<HloComputation*> called_computations;

// Pick different colors or shapes for instructions which are particularly
// expensive (eg, dot) and those which are unusual in some way or unique
// (eg, parameter).
Expand Down Expand Up @@ -401,7 +399,8 @@ string InstructionSequenceGraph(
} else {
// Add a dotted edge between the instruction and any computations that the
// instruction calls.
for (auto* computation : instruction->MakeCalledComputationsSet()) {
for (const HloComputation* computation :
instruction->called_computations()) {
string cluster_name = StrCat("cluster_", ComputationId(computation));
string call_edge = Printf(
"%s -> %s [ style=dashed; ltail=%s ];\n",
Expand Down
Loading

0 comments on commit be6c40a

Please sign in to comment.