Integrate ptxas compiler to iree-compile (iree-org#12823)

Currently `iree-compile` generates PTX and puts into vmfb file. The PTX is compiled on the runtime using `cuModuleLoadDataEx`. This is insufficient in three ways: - Cannot try different `ptxas` compiler version. - `ptxas` might behave differently than `cuModuleLoadDataEx`. - Keeping PTX increases vmfb file size, but also great for jit compile. The PR implements followings: - Introduces three flags in the `iree-compile` : `--iree-hal-use-ptxas=true/false`, `--iree-hal-use-ptxas-from=<path>`, `--iree-hal-use-ptxas-params=<params>` - When `--iree-hal-use-ptxas=true`, compiles the iree generated PTX and keeps it as `cubin` in the `vmfb` file. This reduces the file size significantly. - If `--iree-hal-use-ptxas-from=<path>` is not present, it searches the `ptxas` from the path. - When `--iree-hal-use-ptxas=false`, packs the ptx into vmfb file, and let the runtime to compile. Flags can use them like below : ``` iree-compile code.mlir --iree-hal-use-ptxas-from=/usr/local/cuda-11.8/bin/ptxas -o code.vmfb NOTE: Compiling the generated PTX code $ /usr/local/cuda-11.8/bin/ptxas -arch sm_80 /tmp/iree-cuda-ptx-src-1a146b -o /tmp/iree-cuda-ptx-src-1a146b.cubin 2> /tmp/iree-cuda-ptx-log-0de384 ``` One can pass `-v` to see extra information such as register spilling, static shared and local memory usage and etc. ``` iree-compile code.mlir --iree-hal-use-ptxas-from=/usr/local/cuda-11.8/bin/ptxas --iree-hal-use-ptxas-params=-v -o code.vmfb NOTE: Compiling the generated PTX code $ /usr/local/cuda-11.8/bin/ptxas -arch sm_80 -v /tmp/iree-cuda-ptx-src-ceb4df -o /tmp/iree-cuda-ptx-src-ceb4df.cubin 2> /tmp/iree-cuda-ptx-log-dec4da ptxas info : 0 bytes gmem ptxas info : Compiling entry function 'matmul_dispatch_0_matmul_1024x1024x1024' for 'sm_80' ptxas info : Function properties for matmul_dispatch_0_matmul_1024x1024x1024 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 188 registers, 376 bytes cmem[0] ```
Young768 · Apr 3, 2023 · 29496b8 · 29496b8
1 parent e7f98e2
commit 29496b8
Showing 1 changed file with 168 additions and 4 deletions.
diff --git a/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp b/compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
@@ -12,6 +12,7 @@
 #include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h"
 #include "iree/compiler/Utils/FlatbufferUtils.h"
 #include "iree/compiler/Utils/StringUtils.h"
+#include "iree/compiler/Utils/ToolUtils.h"
 #include "iree/schemas/cuda_executable_def_builder.h"
 #include "iree_cuda/libdevice_embedded.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -25,7 +26,14 @@
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/Internalize.h"
@@ -47,11 +55,165 @@ static llvm::cl::opt<std::string> clTargetChip(
     "iree-hal-cuda-llvm-target-arch", llvm::cl::desc("LLVM target chip."),
     llvm::cl::init("sm_35"));
 
+static llvm::cl::opt<bool> clUsePtxas(
+    "iree-hal-cuda-use-ptxas", llvm::cl::init(false),
+    llvm::cl::desc(
+        "It uses the ptxas compiler that is on the environment, compiles the "
+        "generated PTX code with it, puts the cubin binary generated by ptxas "
+        "into the executable. '--iree-hal-cuda-llvm-target-arch' is used as "
+        "the target GPU. If passing additional parameters to ptxas is desired, "
+        "the parameters flag can be used "
+        "(e.g.'--iree-hal-cuda-use-ptxas-params=-v')."));
+
+static llvm::cl::opt<std::string> clUsePtxasFrom(
+    "iree-hal-cuda-use-ptxas-from", llvm::cl::init(""),
+    llvm::cl::desc(
+        "It uses the provided ptxas compiler, compiles the generated PTX "
+        "code with it, puts the cubin binary generated by ptxas into the "
+        "executable. '--iree-hal-cuda-llvm-target-arch' is used as the "
+        "target GPU. If passing additional parameters to ptxas is desired, the "
+        "parameters flag can be used "
+        "(e.g.'--iree-hal-cuda-use-ptxas-params=-v')."));
+
+static llvm::cl::opt<std::string> clUsePtxasParams(
+    "iree-hal-cuda-use-ptxas-params", llvm::cl::init(""),
+    llvm::cl::desc("Passes the given additional parameters to ptxas."));
+
 namespace mlir {
 namespace iree_compiler {
 namespace IREE {
 namespace HAL {
 
+static constexpr char kPtxasCompilerName[] = "ptxas";
+
+/// Attempts to find ptxas compiler
+static FailureOr<std::string> findPtxasCompiler(std::string *message) {
+  std::string ptxasCompiler;
+  if (!clUsePtxasFrom.empty()) ptxasCompiler = clUsePtxasFrom;
+  if (llvm::sys::fs::exists(ptxasCompiler)) return ptxasCompiler;
+
+  ptxasCompiler = findTool(kPtxasCompilerName);
+  if (llvm::sys::fs::exists(ptxasCompiler)) return ptxasCompiler;
+
+  *message = std::string(
+      "Could not find ptxas compiler. Try passing it explicitly with "
+      "--iree-hal-cuda-use-ptxas-from=<path> flag");
+  return failure();
+}
+
+/// Compiles the given generated PTX code with the given ptxas compiler.
+static FailureOr<std::string> compileWithPtxas(StringRef ptxasCompiler,
+                                               StringRef smCapability,
+                                               StringRef ptxasParams,
+                                               StringRef ptxSource,
+                                               std::string *message) {
+  // Step 1. Create temporary files: ptx source file, log file and cubin file
+  llvm::SmallString<64> ptxSourceFile, stdinFile, stdoutFile, stderrFile;
+  llvm::sys::fs::createTemporaryFile("iree-ptx", "", ptxSourceFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stdin", "", stdinFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stdout", "", stdoutFile);
+  llvm::sys::fs::createTemporaryFile("ptxas-stderr", "", stderrFile);
+  std::string cubinFile = std::string(ptxSourceFile) + ".cubin";
+  llvm::FileRemover stdinRemover(stdinFile.c_str());
+  llvm::FileRemover stdoutRemover(stdoutFile.c_str());
+  llvm::FileRemover stderrRemover(stderrFile.c_str());
+  llvm::FileRemover binRemover(cubinFile.c_str());
+  llvm::FileRemover srcRemover(ptxSourceFile.c_str());
+
+  // Step 2. Write the generated PTX into a file, so we can pass it to ptxas
+  // compiler
+  std::error_code ec;
+  llvm::raw_fd_ostream fPtxSource(ptxSourceFile, ec);
+  fPtxSource << ptxSource;
+  fPtxSource.close();
+  if (fPtxSource.has_error()) {
+    *message = std::string(
+        "Could not write the generated ptx into a temporary file\n");
+    return failure();
+  }
+
+  // Step 3. Build the ptxas command line
+  std::vector<StringRef> ArgVector{
+      StringRef(kPtxasCompilerName), StringRef("-arch"), smCapability,
+      StringRef(ptxSourceFile),      StringRef("-o"),    StringRef(cubinFile)};
+#ifdef _WIN32
+  auto Tokenize = llvm::cl::TokenizeWindowsCommandLine;
+#else
+  auto Tokenize = llvm::cl::TokenizeGNUCommandLine;
+#endif  // _WIN32
+  llvm::BumpPtrAllocator scratchAllocator;
+  llvm::StringSaver stringSaver(scratchAllocator);
+  SmallVector<const char *> rawArgs;
+  Tokenize(ptxasParams, stringSaver, rawArgs, /*MarkEOLs=*/false);
+  for (auto rawArg : rawArgs) ArgVector.push_back(StringRef(rawArg));
+
+  std::optional<StringRef> redirects[] = {
+      stdinFile.str(),
+      stdoutFile.str(),
+      stderrFile.str(),
+  };
+
+  // Step 4. Invoke ptxas
+  if (llvm::sys::ExecuteAndWait(unescapeCommandLineComponent(ptxasCompiler),
+                                llvm::ArrayRef<llvm::StringRef>(ArgVector),
+                                /*Env=*/std::nullopt,
+                                /*Redirects=*/redirects,
+                                /*SecondsToWait=*/0, /*MemoryLimit=*/0,
+                                /*ErrMsg=*/message)) {
+    if (message->empty()) {
+      *message = std::string("Invoking ptxas is failed, see the file: ") +
+                 stderrFile.str().str() + std::string("\n");
+    }
+    stderrRemover.releaseFile();
+    return failure();
+  }
+
+  // Step 5. The output of ptxas if verbose flag is set. This is useful
+  // because it shows local memory usage, register usage, and etc.
+  if (ptxasParams.find("-v") != StringRef::npos) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFlog =
+        llvm::MemoryBuffer::getFile(stderrFile);
+    if (maybeFlog) {
+      llvm::WithColor::note() << maybeFlog->get()->getBuffer().str();
+    }
+  }
+
+  // Step 6. Read the cubin file, and return. It will eventually be written
+  // into executable.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFcubin =
+      llvm::MemoryBuffer::getFile(cubinFile);
+  if (!maybeFcubin) {
+    *message = std::string("Could not read cubin file \n");
+    return failure();
+  }
+
+  return std::string(maybeFcubin->get()->getBuffer());
+}
+
+// Attempt compiling the PtxImage with ptxas compiler. If the compilation fails
+// for some reason return and pack the generated PtxImage code in the
+// executable, let the runtime compile.
+static std::string produceGpuImage(std::string &ptxImage) {
+  if (!clUsePtxas) return ptxImage;
+
+  std::string message;
+  FailureOr<std::string> ptxasCompiler = findPtxasCompiler(&message);
+
+  if (succeeded(ptxasCompiler)) {
+    FailureOr<std::string> maybeCubinImage =
+        compileWithPtxas(ptxasCompiler.value(), clTargetChip, clUsePtxasParams,
+                         ptxImage, &message);
+    if (succeeded(maybeCubinImage)) return maybeCubinImage.value();
+  }
+
+  llvm::WithColor::warning()
+      << "Compilation with `ptxas` failed, the generated ptx will be "
+         "packaged into the executable and compiled at runtime. \n Error : "
+      << message << " \n";
+
+  return ptxImage;
+}
+
 static void dumpBitcodeToPath(StringRef path, StringRef baseName,
                               StringRef suffix, StringRef extension,
                               llvm::Module &module) {
@@ -383,12 +545,14 @@ class CUDATargetBackend final : public TargetBackend {
                      variantOp.getName(), ".ptx", ptxImage);
     }
 
+    std::string gpuImage = produceGpuImage(ptxImage);
+
     FlatbufferBuilder builder;
     iree_CUDAExecutableDef_start_as_root(builder);
 
-    auto ptxImageRef = flatbuffers_uint8_vec_create(
-        builder, reinterpret_cast<const uint8_t *>(ptxImage.c_str()),
-        ptxImage.size());
+    auto gpuImageRef = flatbuffers_uint8_vec_create(
+        builder, reinterpret_cast<const uint8_t *>(gpuImage.c_str()),
+        gpuImage.size());
     iree_CUDABlockSizeDef_vec_start(builder);
     for (const auto &workgroupSize : workgroupSizes) {
       iree_CUDABlockSizeDef_vec_push_create(builder, workgroupSize[0],
@@ -403,7 +567,7 @@ class CUDATargetBackend final : public TargetBackend {
     iree_CUDAExecutableDef_block_sizes_add(builder, blockSizesRef);
     iree_CUDAExecutableDef_shared_memory_size_add(builder,
                                                   workgroupLocalMemoriesRef);
-    iree_CUDAExecutableDef_ptx_image_add(builder, ptxImageRef);
+    iree_CUDAExecutableDef_ptx_image_add(builder, gpuImageRef);
     iree_CUDAExecutableDef_end_as_root(builder);
 
     // Add the binary data to the target executable.