Merge branch 'doitsujin:master' into master

allfoxwy · Aug 14, 2024 · 28fb7c7 · 28fb7c7
2 parents 72077ba + 159f540
commit 28fb7c7
Show file tree

Hide file tree

Showing 17 changed files with 149 additions and 124 deletions.
diff --git a/dxvk.conf b/dxvk.conf
@@ -517,29 +517,6 @@
 # d3d9.forceSwapchainMSAA = -1
 
 
-# Long Mad
-#
-# Should we make our Mads a FFma or do it the long way with an FMul and an FAdd?
-# This solves some rendering bugs in games that have z-pass shaders which
-# don't match entirely to the regular vertex shader in this way.
-#
-# Supported values:
-# - True/False
-
-# d3d11.longMad = False
-# d3d9.longMad = False
-
-
-# Long Dot
-#
-# Whether to emit dot products as an FMA chain or as a plain SPIR-V dot product.
-#
-# Supported values:
-# - True/False
-
-# d3d11.longDot = False
-
-
 # Device Local Constant Buffers
 #
 # Enables using device local, host accessible memory for constant buffers in D3D9.

diff --git a/src/d3d11/d3d11_options.cpp b/src/d3d11/d3d11_options.cpp
@@ -31,8 +31,6 @@ namespace dxvk {
     this->numBackBuffers        = config.getOption<int32_t>("dxgi.numBackBuffers", 0);
     this->maxFrameLatency       = config.getOption<int32_t>("dxgi.maxFrameLatency", 0);
     this->exposeDriverCommandLists = config.getOption<bool>("d3d11.exposeDriverCommandLists", true);
-    this->longMad               = config.getOption<bool>("d3d11.longMad", false);
-    this->longDot               = config.getOption<bool>("d3d11.longDot", false);
     this->reproducibleCommandStream = config.getOption<bool>("d3d11.reproducibleCommandStream", false);
 
     // Clamp LOD bias so that people don't abuse this in unintended ways

diff --git a/src/d3d11/d3d11_options.h b/src/d3d11/d3d11_options.h
@@ -118,12 +118,6 @@ namespace dxvk {
     /// Shader dump path
     std::string shaderDumpPath;
 
-    /// Translate Mad/Dfma to separate FMul+FAdd
-    bool longMad;
-
-    /// Translate DpX to a precise FMul+FFma chain
-    bool longDot;
-
     /// Ensure that for the same D3D commands the output VK commands
     /// don't change between runs. Useful for comparative benchmarking,
     /// can negatively affect performance.

diff --git a/src/d3d9/d3d9_options.cpp b/src/d3d9/d3d9_options.cpp
@@ -67,7 +67,6 @@ namespace dxvk {
     this->forceSampleRateShading        = config.getOption<bool>        ("d3d9.forceSampleRateShading",        false);
     this->forceAspectRatio              = config.getOption<std::string> ("d3d9.forceAspectRatio",              "");
     this->enumerateByDisplays           = config.getOption<bool>        ("d3d9.enumerateByDisplays",           true);
-    this->longMad                       = config.getOption<bool>        ("d3d9.longMad",                       false);
     this->cachedDynamicBuffers          = config.getOption<bool>        ("d3d9.cachedDynamicBuffers",          false);
     this->deviceLocalConstantBuffers    = config.getOption<bool>        ("d3d9.deviceLocalConstantBuffers",    false);
     this->allowDirectBufferMapping      = config.getOption<bool>        ("d3d9.allowDirectBufferMapping",      true);

diff --git a/src/d3d9/d3d9_options.h b/src/d3d9/d3d9_options.h
@@ -119,11 +119,6 @@ namespace dxvk {
     /// Enumerate adapters by displays
     bool enumerateByDisplays;
 
-    /// Should we make our Mads a FFma or do it the long way with an FMul and an FAdd?
-    /// This solves some rendering bugs in games that have z-pass shaders which
-    /// don't match entirely to the regular vertex shader in this way.
-    bool longMad;
-
     /// Cached dynamic buffers: Maps all buffers in cached memory.
     bool cachedDynamicBuffers;
 

diff --git a/src/dxbc/dxbc_compiler.cpp b/src/dxbc/dxbc_compiler.cpp
@@ -1623,7 +1623,9 @@ namespace dxvk {
 
       case DxbcOpcode::Mad:
       case DxbcOpcode::DFma:
-        if (likely(!m_moduleInfo.options.longMad)) {
+        if (ins.controls.precise()) {
+          // FXC only emits precise mad if the shader explicitly uses
+          // the HLSL mad()/fma() intrinsics, let's preserve that.
           dst.id = m_module.opFFma(typeId,
             src.at(0).id, src.at(1).id, src.at(2).id);
         } else {
@@ -2046,37 +2048,25 @@ namespace dxvk {
     dst.type.ccount = 1;
     dst.id = 0;
 
-    if (!m_moduleInfo.options.longDot) {
-      dst.id = m_module.opDot(
-        getVectorTypeId(dst.type),
-        src.at(0).id,
-        src.at(1).id);
+    uint32_t componentType = getVectorTypeId(dst.type);
+    uint32_t componentCount = srcMask.popCount();
 
-      if (ins.controls.precise() || m_precise)
-        m_module.decorate(dst.id, spv::DecorationNoContraction);
-    } else {
-      uint32_t componentType = getVectorTypeId(dst.type);
-      uint32_t componentCount = srcMask.popCount();
-
-      for (uint32_t i = 1; i <= componentCount; i++) {
-        uint32_t idx = componentCount - i;
-
-        if (dst.id) {
-          dst.id = m_module.opFFma(componentType,
-            m_module.opCompositeExtract(componentType, src.at(0).id, 1, &idx),
-            m_module.opCompositeExtract(componentType, src.at(1).id, 1, &idx),
-            dst.id);
-        } else {
-          dst.id = m_module.opFMul(componentType,
-            m_module.opCompositeExtract(componentType, src.at(0).id, 1, &idx),
-            m_module.opCompositeExtract(componentType, src.at(1).id, 1, &idx));
-        }
-
-        // Unconditionally mark as precise since the exact order of operation
-        // matters for some games, even if the instruction itself is not marked
-        // as precise.
-        m_module.decorate(dst.id, spv::DecorationNoContraction);
+    for (uint32_t i = 0; i < componentCount; i++) {
+      if (dst.id) {
+        dst.id = m_module.opFFma(componentType,
+          m_module.opCompositeExtract(componentType, src.at(0).id, 1, &i),
+          m_module.opCompositeExtract(componentType, src.at(1).id, 1, &i),
+          dst.id);
+      } else {
+        dst.id = m_module.opFMul(componentType,
+          m_module.opCompositeExtract(componentType, src.at(0).id, 1, &i),
+          m_module.opCompositeExtract(componentType, src.at(1).id, 1, &i));
       }
+
+      // Unconditionally mark as precise since the exact order of operation
+      // matters for some games, even if the instruction itself is not marked
+      // as precise.
+      m_module.decorate(dst.id, spv::DecorationNoContraction);
     }
 
     dst = emitDstOperandModifiers(dst, ins.modifiers);

diff --git a/src/dxbc/dxbc_options.cpp b/src/dxbc/dxbc_options.cpp
@@ -38,8 +38,6 @@ namespace dxvk {
     disableMsaa              = options.disableMsaa;
     forceSampleRateShading   = options.forceSampleRateShading;
     enableSampleShadingInterlock = device->features().extFragmentShaderInterlock.fragmentShaderSampleInterlock;
-    longMad                  = options.longMad;
-    longDot                  = options.longDot;
 
     // Figure out float control flags to match D3D11 rules
     if (options.floatControls) {

diff --git a/src/dxbc/dxbc_options.h b/src/dxbc/dxbc_options.h
@@ -54,12 +54,6 @@ namespace dxvk {
 
     /// Minimum storage buffer alignment
     VkDeviceSize minSsboAlignment = 0;
-
-    /// Translate Mad/Dfma to separate FMul+FAdd
-    bool longMad;
-
-    /// Translate DpX to a precise FMul+FFma chain
-    bool longDot;
   };
 
 }
diff --git a/src/dxso/dxso_compiler.cpp b/src/dxso/dxso_compiler.cpp
@@ -1932,21 +1932,13 @@ namespace dxvk {
           emitRegisterLoad(src[1], mask).id);
         break;
       case DxsoOpcode::Mad:
-        if (!m_moduleInfo.options.longMad) {
-          result.id = emitFma(
-            emitRegisterLoad(src[0], mask),
-            emitRegisterLoad(src[1], mask),
-            emitRegisterLoad(src[2], mask)).id;
-        }
-        else {
-          result.id = emitMul(
-            emitRegisterLoad(src[0], mask),
-            emitRegisterLoad(src[1], mask)).id;
+        result.id = emitMul(
+          emitRegisterLoad(src[0], mask),
+          emitRegisterLoad(src[1], mask)).id;
 
-          result.id = m_module.opFAdd(typeId,
-            result.id,
-            emitRegisterLoad(src[2], mask).id);
-        }
+        result.id = m_module.opFAdd(typeId,
+          result.id,
+          emitRegisterLoad(src[2], mask).id);
         break;
       case DxsoOpcode::Mul:
         result.id = emitMul(

diff --git a/src/dxso/dxso_options.cpp b/src/dxso/dxso_options.cpp
@@ -29,7 +29,6 @@ namespace dxvk {
 
     vertexFloatConstantBufferAsSSBO = pDevice->GetVertexConstantLayout().floatSize() > devInfo.core.properties.limits.maxUniformBufferRange;
 
-    longMad = options.longMad;
     robustness2Supported = devFeatures.extRobustness2.robustBufferAccess2;
 
     drefScaling         = options.drefScaling;

diff --git a/src/dxso/dxso_options.h b/src/dxso/dxso_options.h
@@ -42,11 +42,6 @@ namespace dxvk {
     /// Should the SWVP float constant buffer be a SSBO (because of the size on NV)
     bool vertexFloatConstantBufferAsSSBO;
 
-    /// Should we make our Mads a FFma or do it the long way with an FMul and an FAdd?
-    /// This solves some rendering bugs in games that have z-pass shaders which
-    /// don't match entirely to the regular vertex shader in this way.
-    bool longMad;
-
     /// Whether or not we can rely on robustness2 to handle oob constant access
     bool robustness2Supported;
 

diff --git a/src/dxvk/dxvk_instance.cpp b/src/dxvk/dxvk_instance.cpp
@@ -182,7 +182,7 @@ namespace dxvk {
       appInfo.pApplicationName      = appName.c_str();
       appInfo.applicationVersion    = flags.raw();
       appInfo.pEngineName           = "DXVK";
-      appInfo.engineVersion         = VK_MAKE_API_VERSION(0, 2, 4, 0);
+      appInfo.engineVersion         = VK_MAKE_API_VERSION(0, 2, 4, 1);
       appInfo.apiVersion            = VK_MAKE_API_VERSION(0, 1, 3, 0);
 
       VkInstanceCreateInfo info = { VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO };

diff --git a/src/spirv/spirv_code_buffer.cpp b/src/spirv/spirv_code_buffer.cpp
@@ -50,6 +50,18 @@ namespace dxvk {
   }
 
 
+  void SpirvCodeBuffer::append(const SpirvInstruction& ins) {
+    // Query the instruction's word count once and remember where
+    // the current code ends; the new words are written behind it.
+    const auto   wordCount = ins.length();
+    const size_t tailStart = m_code.size();
+
+    // Grow the storage in a single step rather than once per word
+    m_code.resize(tailStart + wordCount);
+
+    uint32_t wordIndex = 0;
+
+    while (wordIndex < wordCount) {
+      m_code[tailStart + wordIndex] = ins.arg(wordIndex);
+      wordIndex += 1;
+    }
+
+    // Advance m_ptr by the number of words that were added
+    m_ptr += wordCount;
+  }
+
+
   void SpirvCodeBuffer::append(const SpirvCodeBuffer& other) {
     if (other.size() != 0) {
       const size_t size = m_code.size();

diff --git a/src/spirv/spirv_code_buffer.h b/src/spirv/spirv_code_buffer.h
@@ -89,6 +89,14 @@ namespace dxvk {
      */
     uint32_t allocId();
 
+    /**
+     * \brief Appends an instruction
+     *
+     * Copies all words of the given instruction to the
+     * end of the code buffer in a single operation.
+     * Slightly faster than individually adding words.
+     * \param [in] ins Instruction to append
+     */
+    void append(const SpirvInstruction& ins);
+
     /**
      * \brief Merges two code buffers
      * 

diff --git a/src/spirv/spirv_module.cpp b/src/spirv/spirv_module.cpp
@@ -15,7 +15,7 @@ namespace dxvk {
   }
 
 
-  SpirvCodeBuffer SpirvModule::compile() const {
+  SpirvCodeBuffer SpirvModule::compile() {
     SpirvCodeBuffer result;
     result.putHeader(m_version, m_id);
     result.append(m_capabilities);
@@ -28,7 +28,35 @@ namespace dxvk {
     result.append(m_annotations);
     result.append(m_typeConstDefs);
     result.append(m_variables);
-    result.append(m_code);
+
+    // Perform some crude dead code elimination. In some cases, our compilers
+    // may emit invalid code, such as an unreachable block branching to a loop's
+    // continue block, but those cases cannot be reasonably detected up-front.
+    std::unordered_set<uint32_t> reachableBlocks;
+    std::unordered_set<uint32_t> mergeBlocks;
+
+    classifyBlocks(reachableBlocks, mergeBlocks);
+
+    bool reachable = true;
+
+    for (auto ins : m_code) {
+      if (ins.opCode() == spv::OpFunctionEnd) {
+        reachable = true;
+        result.append(ins);
+      } else if (ins.opCode() == spv::OpLabel) {
+        uint32_t labelId = ins.arg(1);
+
+        if ((reachable = reachableBlocks.find(labelId) != reachableBlocks.end())) {
+          result.append(ins);
+        } else if (mergeBlocks.find(labelId) != mergeBlocks.end()) {
+          result.append(ins);
+          result.putIns(spv::OpUnreachable, 1);
+        }
+      } else if (reachable) {
+        result.append(ins);
+      }
+    }
+
     return result;
   }
 
@@ -3905,4 +3933,69 @@ namespace dxvk {
     }
   }
 
+
+  // Scans the function code and classifies its basic blocks.
+  //
+  // reachableBlocks receives the label IDs of all blocks that can be
+  // reached from the first block of any function by following branch
+  // instructions, plus the continue targets of loops whose header
+  // block is reachable.
+  //
+  // mergeBlocks receives the label IDs of all blocks that are declared
+  // as a merge block by an OpSelectionMerge or OpLoopMerge instruction,
+  // regardless of whether those blocks are reachable.
+  void SpirvModule::classifyBlocks(
+          std::unordered_set<uint32_t>& reachableBlocks,
+          std::unordered_set<uint32_t>& mergeBlocks) {
+    // Control flow edges, keyed by the ID of the source block. The key 0
+    // acts as a virtual root whose successors are the first block of
+    // each function.
+    std::unordered_multimap<uint32_t, uint32_t> branches;
+    std::queue<uint32_t> blockQueue;
+
+    // ID of the block that is currently being scanned, or 0 if no
+    // label has been encountered yet inside the current function.
+    uint32_t blockId = 0;
+
+    for (auto ins : m_code) {
+      switch (ins.opCode()) {
+        case spv::OpLabel: {
+          uint32_t id = ins.arg(1);
+
+          // First label after OpFunction: link it to the virtual root
+          if (!blockId)
+            branches.insert({ 0u, id });
+
+          blockId = id;
+        } break;
+
+        case spv::OpFunction: {
+          blockId = 0u;
+        } break;
+
+        case spv::OpBranch: {
+          branches.insert({ blockId, ins.arg(1) });
+        } break;
+
+        case spv::OpBranchConditional: {
+          // Argument 1 is the condition, 2 and 3 are the branch targets
+          branches.insert({ blockId, ins.arg(2) });
+          branches.insert({ blockId, ins.arg(3) });
+        } break;
+
+        case spv::OpSwitch: {
+          // Argument 2 is the default target, followed by pairs
+          // of a case literal and the corresponding target label.
+          // NOTE(review): this assumes one-word case literals, i.e.
+          // selectors no wider than 32 bits - confirm with callers.
+          branches.insert({ blockId, ins.arg(2) });
+
+          for (uint32_t i = 4; i < ins.length(); i += 2)
+            branches.insert({ blockId, ins.arg(i) });
+        } break;
+
+        case spv::OpSelectionMerge: {
+          mergeBlocks.insert(ins.arg(1));
+        } break;
+
+        case spv::OpLoopMerge: {
+          mergeBlocks.insert(ins.arg(1));
+
+          // The continue target is referenced by the OpLoopMerge
+          // instruction itself. If the loop header is kept, then the
+          // continue block must be kept as well, even if no reachable
+          // block actually branches to it, or the merge instruction
+          // would refer to a label that no longer exists.
+          branches.insert({ blockId, ins.arg(2) });
+        } break;
+
+        default:;
+      }
+    }
+
+    // Breadth-first traversal of the graph, starting at the virtual root
+    blockQueue.push(0);
+
+    while (!blockQueue.empty()) {
+      uint32_t id = blockQueue.front();
+      blockQueue.pop();
+
+      auto range = branches.equal_range(id);
+
+      for (auto i = range.first; i != range.second; i++) {
+        // Only enqueue blocks that have not been visited before
+        if (reachableBlocks.insert(i->second).second)
+          blockQueue.push(i->second);
+      }
+    }
+  }
+
 }
diff --git a/src/spirv/spirv_module.h b/src/spirv/spirv_module.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <queue>
+#include <unordered_map>
 #include <unordered_set>
 
 #include "spirv_code_buffer.h"
@@ -59,7 +61,7 @@ namespace dxvk {
 
     ~SpirvModule();
 
-    SpirvCodeBuffer compile() const;
+    SpirvCodeBuffer compile();
 
     size_t getInsertionPtr() {
       return m_code.getInsertionPtr();
@@ -1326,6 +1328,10 @@ namespace dxvk {
     bool isInterfaceVar(
             spv::StorageClass       sclass) const;
 
+    void classifyBlocks(
+            std::unordered_set<uint32_t>& reachableBlocks,
+            std::unordered_set<uint32_t>& mergeBlocks);
+
   };
 
 }