Skip to content

Commit

Permalink
Integrate ptxas compiler to iree-compile (iree-org#12823)
Browse files Browse the repository at this point in the history
Currently `iree-compile` generates PTX and embeds it in the vmfb file. The PTX
is then compiled at runtime using `cuModuleLoadDataEx`. This is
insufficient in three ways:
- It is not possible to try different `ptxas` compiler versions.
- `ptxas` might behave differently than `cuModuleLoadDataEx`.
- Keeping PTX increases the vmfb file size, although it is also great for JIT compilation.

This PR implements the following:
- Introduces three flags in `iree-compile`:
`--iree-hal-use-ptxas=true/false`, `--iree-hal-use-ptxas-from=<path>`,
`--iree-hal-use-ptxas-params=<params>`
- When `--iree-hal-use-ptxas=true`, compiles the IREE-generated PTX and
stores the result as a `cubin` in the `vmfb` file. This reduces the file size
significantly.
- If `--iree-hal-use-ptxas-from=<path>` is not present, it searches for
`ptxas` on the system path.
- When `--iree-hal-use-ptxas=false`, packs the PTX into the vmfb file and
lets the runtime compile it.


The flags can be used as shown below:
```
iree-compile code.mlir --iree-hal-use-ptxas-from=/usr/local/cuda-11.8/bin/ptxas -o code.vmfb
NOTE: Compiling the generated PTX code
 $ /usr/local/cuda-11.8/bin/ptxas -arch sm_80  /tmp/iree-cuda-ptx-src-1a146b -o /tmp/iree-cuda-ptx-src-1a146b.cubin 2> /tmp/iree-cuda-ptx-log-0de384
```

One can pass `-v` to see extra information such as register spilling,
static shared memory usage, local memory usage, etc.
```
iree-compile code.mlir --iree-hal-use-ptxas-from=/usr/local/cuda-11.8/bin/ptxas --iree-hal-use-ptxas-params=-v -o code.vmfb

NOTE: Compiling the generated PTX code
 $ /usr/local/cuda-11.8/bin/ptxas -arch sm_80 -v /tmp/iree-cuda-ptx-src-ceb4df -o /tmp/iree-cuda-ptx-src-ceb4df.cubin 2> /tmp/iree-cuda-ptx-log-dec4da
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function 'matmul_dispatch_0_matmul_1024x1024x1024' for 'sm_80'
ptxas info    : Function properties for matmul_dispatch_0_matmul_1024x1024x1024
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 188 registers, 376 bytes cmem[0]
```
  • Loading branch information
grypp authored Apr 3, 2023
1 parent e7f98e2 commit 29496b8
Showing 1 changed file with 168 additions and 4 deletions.
172 changes: 168 additions & 4 deletions compiler/src/iree/compiler/Dialect/HAL/Target/CUDA/CUDATarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h"
#include "iree/compiler/Utils/FlatbufferUtils.h"
#include "iree/compiler/Utils/StringUtils.h"
#include "iree/compiler/Utils/ToolUtils.h"
#include "iree/schemas/cuda_executable_def_builder.h"
#include "iree_cuda/libdevice_embedded.h"
#include "llvm/Analysis/TargetTransformInfo.h"
Expand All @@ -25,7 +26,14 @@
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/StandardInstrumentations.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Internalize.h"
Expand All @@ -47,11 +55,165 @@ static llvm::cl::opt<std::string> clTargetChip(
"iree-hal-cuda-llvm-target-arch", llvm::cl::desc("LLVM target chip."),
llvm::cl::init("sm_35"));

// Opt-in switch, off by default. produceGpuImage() returns the PTX unchanged
// unless this flag is set; when it is set, the PTX is handed to ptxas and the
// resulting cubin is what gets embedded into the executable.
static llvm::cl::opt<bool> clUsePtxas(
"iree-hal-cuda-use-ptxas", llvm::cl::init(false),
llvm::cl::desc(
"It uses the ptxas compiler that is on the environment, compiles the "
"generated PTX code with it, puts the cubin binary generated by ptxas "
"into the executable. '--iree-hal-cuda-llvm-target-arch' is used as "
"the target GPU. If passing additional parameters to ptxas is desired, "
"the parameters flag can be used "
"(e.g.'--iree-hal-cuda-use-ptxas-params=-v')."));

// Optional explicit path to a ptxas binary, empty by default.
// findPtxasCompiler() prefers this path when it exists on disk and otherwise
// falls back to a tool lookup by name. NOTE(review): this flag is only
// consulted when '--iree-hal-cuda-use-ptxas' is enabled (see
// produceGpuImage()); setting it alone does not trigger ptxas.
static llvm::cl::opt<std::string> clUsePtxasFrom(
"iree-hal-cuda-use-ptxas-from", llvm::cl::init(""),
llvm::cl::desc(
"It uses the provided ptxas compiler, compiles the generated PTX "
"code with it, puts the cubin binary generated by ptxas into the "
"executable. '--iree-hal-cuda-llvm-target-arch' is used as the "
"target GPU. If passing additional parameters to ptxas is desired, the "
"parameters flag can be used "
"(e.g.'--iree-hal-cuda-use-ptxas-params=-v')."));

// Extra command-line text for ptxas, empty by default. compileWithPtxas()
// tokenizes this string with the platform's command-line tokenizer and appends
// the tokens to the ptxas argument list; if the string contains "-v", the
// captured ptxas stderr output is echoed as a note.
static llvm::cl::opt<std::string> clUsePtxasParams(
"iree-hal-cuda-use-ptxas-params", llvm::cl::init(""),
llvm::cl::desc("Passes the given additional parameters to ptxas."));

namespace mlir {
namespace iree_compiler {
namespace IREE {
namespace HAL {

// Name of the ptxas executable. Used as the tool name looked up by
// findPtxasCompiler() and as the first element of the argument vector that
// compileWithPtxas() passes to the ptxas invocation.
static constexpr char kPtxasCompilerName[] = "ptxas";

/// Locates the ptxas compiler to use.
///
/// The path given with '--iree-hal-cuda-use-ptxas-from' is returned when it is
/// set and exists on disk. Otherwise the tool is looked up by name via
/// findTool() and that result is returned if it exists. When neither candidate
/// exists, `*message` receives a hint for the user and failure() is returned.
/// `message` must not be null.
static FailureOr<std::string> findPtxasCompiler(std::string *message) {
  // Candidate 1: the compiler explicitly requested on the command line.
  const std::string &requestedPath = clUsePtxasFrom;
  if (!requestedPath.empty() && llvm::sys::fs::exists(requestedPath)) {
    return requestedPath;
  }

  // Candidate 2: whatever the tool lookup finds under the default name.
  std::string discoveredPath = findTool(kPtxasCompilerName);
  if (llvm::sys::fs::exists(discoveredPath)) {
    return discoveredPath;
  }

  // Nothing usable: tell the caller how the user can fix it.
  message->assign(
      "Could not find ptxas compiler. Try passing it explicitly with "
      "--iree-hal-cuda-use-ptxas-from=<path> flag");
  return failure();
}

/// Compiles the given generated PTX code with the given ptxas compiler.
///
/// The PTX text in `ptxSource` is written to a temporary file and
/// `ptxasCompiler` is invoked on it with `-arch <smCapability>`, plus any
/// extra arguments tokenized from `ptxasParams`. The contents of the cubin
/// file written by ptxas are returned. All temporary files are removed when
/// the function returns, except the stderr log when the ptxas invocation
/// itself fails (the error message points at that file).
///
/// Returns failure() and fills `*message` with a description when a temporary
/// file cannot be created or written, when ptxas cannot be run or exits with a
/// non-zero status, or when the cubin cannot be read back. `message` must not
/// be null.
static FailureOr<std::string> compileWithPtxas(StringRef ptxasCompiler,
                                               StringRef smCapability,
                                               StringRef ptxasParams,
                                               StringRef ptxSource,
                                               std::string *message) {
  // Step 1. Create temporary files: ptx source file, log files and cubin file.
  // Every creation is checked so that a failure is reported to the caller
  // instead of silently passing empty paths to the steps below.
  auto createTempFile = [message](const char *prefix,
                                  llvm::SmallVectorImpl<char> &path) {
    std::error_code createEc =
        llvm::sys::fs::createTemporaryFile(prefix, "", path);
    if (!createEc) return true;
    *message = std::string("Could not create a temporary file for ptxas: ") +
               createEc.message() + std::string("\n");
    return false;
  };

  // Each remover is armed right after its file exists so that an early return
  // still cleans up the files created so far.
  llvm::SmallString<64> ptxSourceFile, stdinFile, stdoutFile, stderrFile;
  if (!createTempFile("iree-ptx", ptxSourceFile)) return failure();
  llvm::FileRemover srcRemover(ptxSourceFile.c_str());
  if (!createTempFile("ptxas-stdin", stdinFile)) return failure();
  llvm::FileRemover stdinRemover(stdinFile.c_str());
  if (!createTempFile("ptxas-stdout", stdoutFile)) return failure();
  llvm::FileRemover stdoutRemover(stdoutFile.c_str());
  if (!createTempFile("ptxas-stderr", stderrFile)) return failure();
  llvm::FileRemover stderrRemover(stderrFile.c_str());
  // The cubin is produced by ptxas itself, next to the PTX source file.
  std::string cubinFile = std::string(ptxSourceFile) + ".cubin";
  llvm::FileRemover binRemover(cubinFile.c_str());

  // Step 2. Write the generated PTX into a file, so we can pass it to ptxas
  // compiler
  std::error_code ec;
  llvm::raw_fd_ostream fPtxSource(ptxSourceFile, ec);
  if (ec) {
    // The file could not be opened; writing to the stream would be invalid.
    *message = std::string(
        "Could not write the generated ptx into a temporary file\n");
    return failure();
  }
  fPtxSource << ptxSource;
  fPtxSource.close();
  if (fPtxSource.has_error()) {
    // A pending error must be cleared before the stream is destroyed,
    // otherwise its destructor reports a fatal error and aborts the process
    // instead of letting the caller fall back to packaging the PTX.
    fPtxSource.clear_error();
    *message = std::string(
        "Could not write the generated ptx into a temporary file\n");
    return failure();
  }

  // Step 3. Build the ptxas command line
  std::vector<StringRef> ArgVector{
      StringRef(kPtxasCompilerName), StringRef("-arch"), smCapability,
      StringRef(ptxSourceFile),      StringRef("-o"),    StringRef(cubinFile)};
#ifdef _WIN32
  auto Tokenize = llvm::cl::TokenizeWindowsCommandLine;
#else
  auto Tokenize = llvm::cl::TokenizeGNUCommandLine;
#endif  // _WIN32
  // The saver owns the tokenized strings; it must outlive ArgVector's use.
  llvm::BumpPtrAllocator scratchAllocator;
  llvm::StringSaver stringSaver(scratchAllocator);
  SmallVector<const char *> rawArgs;
  Tokenize(ptxasParams, stringSaver, rawArgs, /*MarkEOLs=*/false);
  for (auto rawArg : rawArgs) ArgVector.push_back(StringRef(rawArg));

  // stdin, stdout and stderr of ptxas are all redirected to temporary files.
  std::optional<StringRef> redirects[] = {
      stdinFile.str(),
      stdoutFile.str(),
      stderrFile.str(),
  };

  // Step 4. Invoke ptxas
  if (llvm::sys::ExecuteAndWait(unescapeCommandLineComponent(ptxasCompiler),
                                llvm::ArrayRef<llvm::StringRef>(ArgVector),
                                /*Env=*/std::nullopt,
                                /*Redirects=*/redirects,
                                /*SecondsToWait=*/0, /*MemoryLimit=*/0,
                                /*ErrMsg=*/message)) {
    if (message->empty()) {
      *message = std::string("Invoking ptxas is failed, see the file: ") +
                 stderrFile.str().str() + std::string("\n");
    }
    // Keep the log on disk: the message above refers the user to it.
    stderrRemover.releaseFile();
    return failure();
  }

  // Step 5. The output of ptxas if verbose flag is set. This is useful
  // because it shows local memory usage, register usage, and etc.
  if (ptxasParams.find("-v") != StringRef::npos) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFlog =
        llvm::MemoryBuffer::getFile(stderrFile);
    if (maybeFlog) {
      llvm::WithColor::note() << maybeFlog->get()->getBuffer().str();
    }
  }

  // Step 6. Read the cubin file, and return. It will eventually be written
  // into executable.
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> maybeFcubin =
      llvm::MemoryBuffer::getFile(cubinFile);
  if (!maybeFcubin) {
    *message = std::string("Could not read cubin file \n");
    return failure();
  }

  return std::string(maybeFcubin->get()->getBuffer());
}

// Attempt compiling the PtxImage with ptxas compiler. If the compilation fails
// for some reason return and pack the generated PtxImage code in the
// executable, let the runtime compile.
static std::string produceGpuImage(std::string &ptxImage) {
if (!clUsePtxas) return ptxImage;

std::string message;
FailureOr<std::string> ptxasCompiler = findPtxasCompiler(&message);

if (succeeded(ptxasCompiler)) {
FailureOr<std::string> maybeCubinImage =
compileWithPtxas(ptxasCompiler.value(), clTargetChip, clUsePtxasParams,
ptxImage, &message);
if (succeeded(maybeCubinImage)) return maybeCubinImage.value();
}

llvm::WithColor::warning()
<< "Compilation with `ptxas` failed, the generated ptx will be "
"packaged into the executable and compiled at runtime. \n Error : "
<< message << " \n";

return ptxImage;
}

static void dumpBitcodeToPath(StringRef path, StringRef baseName,
StringRef suffix, StringRef extension,
llvm::Module &module) {
Expand Down Expand Up @@ -383,12 +545,14 @@ class CUDATargetBackend final : public TargetBackend {
variantOp.getName(), ".ptx", ptxImage);
}

std::string gpuImage = produceGpuImage(ptxImage);

FlatbufferBuilder builder;
iree_CUDAExecutableDef_start_as_root(builder);

auto ptxImageRef = flatbuffers_uint8_vec_create(
builder, reinterpret_cast<const uint8_t *>(ptxImage.c_str()),
ptxImage.size());
auto gpuImageRef = flatbuffers_uint8_vec_create(
builder, reinterpret_cast<const uint8_t *>(gpuImage.c_str()),
gpuImage.size());
iree_CUDABlockSizeDef_vec_start(builder);
for (const auto &workgroupSize : workgroupSizes) {
iree_CUDABlockSizeDef_vec_push_create(builder, workgroupSize[0],
Expand All @@ -403,7 +567,7 @@ class CUDATargetBackend final : public TargetBackend {
iree_CUDAExecutableDef_block_sizes_add(builder, blockSizesRef);
iree_CUDAExecutableDef_shared_memory_size_add(builder,
workgroupLocalMemoriesRef);
iree_CUDAExecutableDef_ptx_image_add(builder, ptxImageRef);
iree_CUDAExecutableDef_ptx_image_add(builder, gpuImageRef);
iree_CUDAExecutableDef_end_as_root(builder);

// Add the binary data to the target executable.
Expand Down

0 comments on commit 29496b8

Please sign in to comment.