diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a34cc9683c..50cd770874 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,6 +35,8 @@ add_library(core STATIC GPU2D_Soft.cpp GPU3D.cpp GPU3D_Soft.cpp + GPU3D_Texcache.cpp + GPU3D_Texcache.h melonDLDI.h NDS.cpp NDSCart.cpp @@ -78,6 +80,9 @@ if (ENABLE_OGLRENDERER) GPU_OpenGL.cpp GPU_OpenGL_shaders.h GPU3D_OpenGL.cpp + GPU3D_Compute.cpp + GPU3D_TexcacheOpenGL.cpp + GPU3D_TexcacheOpenGL.h GPU3D_OpenGL_shaders.h OpenGLSupport.cpp) diff --git a/src/DMA.cpp b/src/DMA.cpp index 717b38fab5..0fc6cf05f1 100644 --- a/src/DMA.cpp +++ b/src/DMA.cpp @@ -21,6 +21,7 @@ #include "DSi.h" #include "DMA.h" #include "GPU.h" +#include "GPU3D.h" #include "DMA_Timings.h" #include "Platform.h" diff --git a/src/DSi_NDMA.cpp b/src/DSi_NDMA.cpp index fe1f0ba7ed..7c77c9adea 100644 --- a/src/DSi_NDMA.cpp +++ b/src/DSi_NDMA.cpp @@ -22,6 +22,7 @@ #include "DSi_NDMA.h" #include "GPU.h" #include "DSi_AES.h" +#include "GPU3D.h" namespace melonDS { diff --git a/src/GPU.cpp b/src/GPU.cpp index f23e641e48..07a6654e08 100644 --- a/src/GPU.cpp +++ b/src/GPU.cpp @@ -23,7 +23,7 @@ #include "ARMJIT.h" #include "GPU2D_Soft.h" -#include "GPU3D_Soft.h" +#include "GPU3D.h" namespace melonDS { diff --git a/src/GPU2D.cpp b/src/GPU2D.cpp index e0aa630d3b..be6a598798 100644 --- a/src/GPU2D.cpp +++ b/src/GPU2D.cpp @@ -20,6 +20,7 @@ #include #include "NDS.h" #include "GPU.h" +#include "GPU3D.h" namespace melonDS { diff --git a/src/GPU2D_Soft.cpp b/src/GPU2D_Soft.cpp index e01d366597..26f9a875fb 100644 --- a/src/GPU2D_Soft.cpp +++ b/src/GPU2D_Soft.cpp @@ -18,7 +18,7 @@ #include "GPU2D_Soft.h" #include "GPU.h" -#include "GPU3D_OpenGL.h" +#include "GPU3D.h" namespace melonDS { diff --git a/src/GPU3D.cpp b/src/GPU3D.cpp index 47abae2f0c..44561dfa8e 100644 --- a/src/GPU3D.cpp +++ b/src/GPU3D.cpp @@ -24,6 +24,7 @@ #include "FIFO.h" #include "GPU3D_Soft.h" #include "Platform.h" +#include "GPU3D.h" namespace melonDS { diff --git a/src/GPU3D.h 
b/src/GPU3D.h index 4a5fe6e0c6..f5446f345d 100644 --- a/src/GPU3D.h +++ b/src/GPU3D.h @@ -349,7 +349,14 @@ class Renderer3D virtual void RestartFrame(GPU& gpu) {}; virtual u32* GetLine(int line) = 0; virtual void Blit(const GPU& gpu) {}; + + virtual void SetupAccelFrame() {} virtual void PrepareCaptureFrame() {} + virtual void BindOutputTexture(int buffer) {} + + virtual bool NeedsShaderCompile() { return false; } + virtual void ShaderCompileStep(int& current, int& count) {} + protected: Renderer3D(bool Accelerated); }; diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp new file mode 100644 index 0000000000..bf1f4712be --- /dev/null +++ b/src/GPU3D_Compute.cpp @@ -0,0 +1,1136 @@ +/* + Copyright 2016-2022 melonDS team + + This file is part of melonDS. + + melonDS is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation, either version 3 of the License, or (at your option) + any later version. + + melonDS is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with melonDS. If not, see http://www.gnu.org/licenses/. 
+*/ + +#include "GPU3D_Compute.h" + +#include + +#include "OpenGLSupport.h" + +#include "GPU3D_Compute_shaders.h" + +namespace melonDS +{ + +ComputeRenderer::ComputeRenderer(GLCompositor&& compositor) + : Renderer3D(true), Texcache(TexcacheOpenGLLoader()), CurGLCompositor(std::move(compositor)) +{} + +bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, const std::initializer_list& defines) +{ + std::string shaderName; + std::string shaderSource; + shaderSource += "#version 430 core\n"; + for (const char* define : defines) + { + shaderSource += "#define "; + shaderSource += define; + shaderSource += '\n'; + shaderName += define; + shaderName += ','; + } + shaderSource += "#define ScreenWidth "; + shaderSource += std::to_string(ScreenWidth); + shaderSource += "\n#define ScreenHeight "; + shaderSource += std::to_string(ScreenHeight); + shaderSource += "\n#define MaxWorkTiles "; + shaderSource += std::to_string(MaxWorkTiles); + + shaderSource += ComputeRendererShaders::Common; + shaderSource += source; + + return OpenGL::CompileComputeProgram(shader, shaderSource.c_str(), shaderName.c_str()); +} + +void ComputeRenderer::ShaderCompileStep(int& current, int& count) +{ + current = ShaderStepIdx; + ShaderStepIdx++; + count = 33; + switch (current) + { + case 0: + CompileShader(ShaderInterpXSpans[0], ComputeRendererShaders::InterpSpans, {"InterpSpans", "ZBuffer"}); + return; + case 1: + CompileShader(ShaderInterpXSpans[1], ComputeRendererShaders::InterpSpans, {"InterpSpans", "WBuffer"}); + return; + case 2: + CompileShader(ShaderBinCombined, ComputeRendererShaders::BinCombined, {"BinCombined"}); + return; + case 3: + CompileShader(ShaderDepthBlend[0], ComputeRendererShaders::DepthBlend, {"DepthBlend", "ZBuffer"}); + return; + case 4: + CompileShader(ShaderDepthBlend[1], ComputeRendererShaders::DepthBlend, {"DepthBlend", "WBuffer"}); + return; + case 5: + CompileShader(ShaderRasteriseNoTexture[0], ComputeRendererShaders::Rasterise, {"Rasterise", 
"ZBuffer", "NoTexture"}); + return; + case 6: + CompileShader(ShaderRasteriseNoTexture[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture"}); + return; + case 7: + CompileShader(ShaderRasteriseNoTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Toon"}); + return; + case 8: + CompileShader(ShaderRasteriseNoTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Toon"}); + return; + case 9: + CompileShader(ShaderRasteriseNoTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "NoTexture", "Highlight"}); + return; + case 10: + CompileShader(ShaderRasteriseNoTextureHighlight[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "NoTexture", "Highlight"}); + return; + case 11: + CompileShader(ShaderRasteriseUseTextureDecal[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Decal"}); + return; + case 12: + CompileShader(ShaderRasteriseUseTextureDecal[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Decal"}); + return; + case 13: + CompileShader(ShaderRasteriseUseTextureModulate[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Modulate"}); + return; + case 14: + CompileShader(ShaderRasteriseUseTextureModulate[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Modulate"}); + return; + case 15: + CompileShader(ShaderRasteriseUseTextureToon[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Toon"}); + return; + case 16: + CompileShader(ShaderRasteriseUseTextureToon[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Toon"}); + return; + case 17: + CompileShader(ShaderRasteriseUseTextureHighlight[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "UseTexture", "Highlight"}); + return; + case 18: + CompileShader(ShaderRasteriseUseTextureHighlight[1], 
ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "UseTexture", "Highlight"}); + return; + case 19: + CompileShader(ShaderRasteriseShadowMask[0], ComputeRendererShaders::Rasterise, {"Rasterise", "ZBuffer", "ShadowMask"}); + return; + case 20: + CompileShader(ShaderRasteriseShadowMask[1], ComputeRendererShaders::Rasterise, {"Rasterise", "WBuffer", "ShadowMask"}); + return; + case 21: + CompileShader(ShaderClearCoarseBinMask, ComputeRendererShaders::ClearCoarseBinMask, {"ClearCoarseBinMask"}); + return; + case 22: + CompileShader(ShaderClearIndirectWorkCount, ComputeRendererShaders::ClearIndirectWorkCount, {"ClearIndirectWorkCount"}); + return; + case 23: + CompileShader(ShaderCalculateWorkListOffset, ComputeRendererShaders::CalcOffsets, {"CalculateWorkOffsets"}); + return; + case 24: + CompileShader(ShaderSortWork, ComputeRendererShaders::SortWork, {"SortWork"}); + return; + case 25: + CompileShader(ShaderFinalPass[0], ComputeRendererShaders::FinalPass, {"FinalPass"}); + return; + case 26: + CompileShader(ShaderFinalPass[1], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking"}); + return; + case 27: + CompileShader(ShaderFinalPass[2], ComputeRendererShaders::FinalPass, {"FinalPass", "Fog"}); + return; + case 28: + CompileShader(ShaderFinalPass[3], ComputeRendererShaders::FinalPass, {"FinalPass", "EdgeMarking", "Fog"}); + return; + case 29: + CompileShader(ShaderFinalPass[4], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing"}); + return; + case 30: + CompileShader(ShaderFinalPass[5], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking"}); + return; + case 31: + CompileShader(ShaderFinalPass[6], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "Fog"}); + return; + case 32: + CompileShader(ShaderFinalPass[7], ComputeRendererShaders::FinalPass, {"FinalPass", "AntiAliasing", "EdgeMarking", "Fog"}); + return; + default: + __builtin_unreachable(); + return; + } +} + +void blah(GLenum source, 
GLenum type, GLuint id, GLenum severity, GLsizei length, const GLchar *message, const void *userParam) +{ + printf("%s\n", message); +} + +std::unique_ptr ComputeRenderer::New() +{ + std::optional compositor = GLCompositor::New(); + if (!compositor) + return nullptr; + + std::unique_ptr result = std::unique_ptr(new ComputeRenderer(std::move(*compositor))); + + //glDebugMessageCallback(blah, NULL); + //glEnable(GL_DEBUG_OUTPUT); + glGenBuffers(1, &result->YSpanSetupMemory); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->YSpanSetupMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupY)*MaxYSpanSetups, nullptr, GL_DYNAMIC_DRAW); + + glGenBuffers(1, &result->RenderPolygonMemory); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, result->RenderPolygonMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(RenderPolygon)*2048, nullptr, GL_DYNAMIC_DRAW); + + glGenBuffers(1, &result->XSpanSetupMemory); + glGenBuffers(1, &result->BinResultMemory); + glGenBuffers(1, &result->FinalTileMemory); + glGenBuffers(1, &result->YSpanIndicesTextureMemory); + glGenBuffers(tilememoryLayer_Num, result->TileMemory); + glGenBuffers(1, &result->WorkDescMemory); + + glGenTextures(1, &result->YSpanIndicesTexture); + glGenTextures(1, &result->LowResFramebuffer); + glBindTexture(GL_TEXTURE_2D, result->LowResFramebuffer); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8UI, 256, 192); + + glGenBuffers(1, &result->MetaUniformMemory); + glBindBuffer(GL_UNIFORM_BUFFER, result->MetaUniformMemory); + glBufferData(GL_UNIFORM_BUFFER, sizeof(MetaUniform), nullptr, GL_DYNAMIC_DRAW); + + glGenSamplers(9, result->Samplers); + for (u32 j = 0; j < 3; j++) + { + for (u32 i = 0; i < 3; i++) + { + const GLenum translateWrapMode[3] = {GL_CLAMP_TO_EDGE, GL_REPEAT, GL_MIRRORED_REPEAT}; + glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_WRAP_S, translateWrapMode[i]); + glSamplerParameteri(result->Samplers[i+j*3], GL_TEXTURE_WRAP_T, translateWrapMode[j]); + glSamplerParameteri(result->Samplers[i+j*3], 
GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glSamplerParameterf(result->Samplers[i+j*3], GL_TEXTURE_MAG_FILTER, GL_NEAREST); + } + } + + glGenBuffers(1, &result->PixelBuffer); + glBindBuffer(GL_PIXEL_PACK_BUFFER, result->PixelBuffer); + glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ); + + return result; +} + +ComputeRenderer::~ComputeRenderer() +{ + Texcache.Reset(); + + glDeleteBuffers(1, &YSpanSetupMemory); + glDeleteBuffers(1, &RenderPolygonMemory); + glDeleteBuffers(1, &XSpanSetupMemory); + glDeleteBuffers(1, &BinResultMemory); + glDeleteBuffers(tilememoryLayer_Num, TileMemory); + glDeleteBuffers(1, &WorkDescMemory); + glDeleteBuffers(1, &FinalTileMemory); + glDeleteBuffers(1, &YSpanIndicesTextureMemory); + glDeleteTextures(1, &YSpanIndicesTexture); + glDeleteTextures(1, &Framebuffer); + glDeleteTextures(1, &LowResFramebuffer); // created in New(); was never released + glDeleteBuffers(1, &MetaUniformMemory); + glDeleteSamplers(9, Samplers); + glDeleteBuffers(1, &PixelBuffer); +} + +void ComputeRenderer::DeleteShaders() +{ + std::initializer_list allPrograms = + { + ShaderInterpXSpans[0], + ShaderInterpXSpans[1], + ShaderBinCombined, + ShaderDepthBlend[0], + ShaderDepthBlend[1], + ShaderRasteriseNoTexture[0], + ShaderRasteriseNoTexture[1], + ShaderRasteriseNoTextureToon[0], + ShaderRasteriseNoTextureToon[1], + ShaderRasteriseNoTextureHighlight[0], + ShaderRasteriseNoTextureHighlight[1], + ShaderRasteriseUseTextureDecal[0], + ShaderRasteriseUseTextureDecal[1], + ShaderRasteriseUseTextureModulate[0], + ShaderRasteriseUseTextureModulate[1], + ShaderRasteriseUseTextureToon[0], + ShaderRasteriseUseTextureToon[1], + ShaderRasteriseUseTextureHighlight[0], + ShaderRasteriseUseTextureHighlight[1], + ShaderRasteriseShadowMask[0], + ShaderRasteriseShadowMask[1], + ShaderClearCoarseBinMask, + ShaderClearIndirectWorkCount, + ShaderCalculateWorkListOffset, + ShaderSortWork, + ShaderFinalPass[0], + ShaderFinalPass[1], + ShaderFinalPass[2], + ShaderFinalPass[3], + ShaderFinalPass[4], + ShaderFinalPass[5], + ShaderFinalPass[6], + 
ShaderFinalPass[7], + }; + for (GLuint program : allPrograms) + glDeleteProgram(program); +} + +void ComputeRenderer::Reset(GPU& gpu) +{ + Texcache.Reset(); +} + +void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinates) +{ + CurGLCompositor.SetScaleFactor(scale); + + if (ScaleFactor != -1) + { + DeleteShaders(); + } + + ShaderStepIdx = 0; + + ScaleFactor = scale; + ScreenWidth = 256 * ScaleFactor; + ScreenHeight = 192 * ScaleFactor; + + TilesPerLine = ScreenWidth/TileSize; + TileLines = ScreenHeight/TileSize; + + HiresCoordinates = highResolutionCoordinates; + + MaxWorkTiles = TilesPerLine*TileLines*8; + + for (int i = 0; i < tilememoryLayer_Num; i++) + { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, TileMemory[i]); + glBufferData(GL_SHADER_STORAGE_BUFFER, 4*TileSize*TileSize*MaxWorkTiles, nullptr, GL_DYNAMIC_DRAW); + } + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, FinalTileMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, 4*3*2*ScreenWidth*ScreenHeight, nullptr, GL_DYNAMIC_DRAW); + + int binResultSize = sizeof(BinResultHeader) + + TilesPerLine*TileLines*CoarseBinStride*4 // BinnedMaskCoarse + + TilesPerLine*TileLines*BinStride*4 // BinnedMask + + TilesPerLine*TileLines*BinStride*4; // WorkOffsets + glBindBuffer(GL_SHADER_STORAGE_BUFFER, BinResultMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, binResultSize, nullptr, GL_DYNAMIC_DRAW); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, WorkDescMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, MaxWorkTiles*2*4*2, nullptr, GL_DYNAMIC_DRAW); + + if (Framebuffer != 0) + glDeleteTextures(1, &Framebuffer); + glGenTextures(1, &Framebuffer); + glBindTexture(GL_TEXTURE_2D, Framebuffer); + glTexStorage2D(GL_TEXTURE_2D, 1, GL_RGBA8, ScreenWidth, ScreenHeight); + + // eh those are pretty bad guesses + // though real hw shouldn't be able to render all 2048 polygons on every line either + int maxYSpanIndices = 64*2048 * ScaleFactor; // also sizes the YSpanIndicesTextureMemory and XSpanSetupMemory buffers allocated below + YSpanIndices.resize(maxYSpanIndices); + + glBindBuffer(GL_TEXTURE_BUFFER, 
YSpanIndicesTextureMemory); + glBufferData(GL_TEXTURE_BUFFER, maxYSpanIndices*2*4, nullptr, GL_DYNAMIC_DRAW); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, XSpanSetupMemory); + glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(SpanSetupX)*maxYSpanIndices, nullptr, GL_DYNAMIC_DRAW); + + glBindTexture(GL_TEXTURE_BUFFER, YSpanIndicesTexture); + glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA16UI, YSpanIndicesTextureMemory); +} + +void ComputeRenderer::VCount144(GPU& gpu) +{ + +} + +void ComputeRenderer::SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to) +{ + span->Z0 = poly->FinalZ[from]; + span->W0 = poly->FinalW[from]; + span->Z1 = poly->FinalZ[to]; + span->W1 = poly->FinalW[to]; + span->ColorR0 = poly->Vertices[from]->FinalColor[0]; + span->ColorG0 = poly->Vertices[from]->FinalColor[1]; + span->ColorB0 = poly->Vertices[from]->FinalColor[2]; + span->ColorR1 = poly->Vertices[to]->FinalColor[0]; + span->ColorG1 = poly->Vertices[to]->FinalColor[1]; + span->ColorB1 = poly->Vertices[to]->FinalColor[2]; + span->TexcoordU0 = poly->Vertices[from]->TexCoords[0]; + span->TexcoordV0 = poly->Vertices[from]->TexCoords[1]; + span->TexcoordU1 = poly->Vertices[to]->TexCoords[0]; + span->TexcoordV1 = poly->Vertices[to]->TexCoords[1]; +} + +void ComputeRenderer::SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]) +{ + s32 x0 = positions[vertex][0]; + if (side) + { + span->DxInitial = -0x40000; + x0--; + } + else + { + span->DxInitial = 0; + } + + span->X0 = span->X1 = x0; + span->XMin = x0; + span->XMax = x0; + span->Y0 = span->Y1 = positions[vertex][1]; + + if (span->XMin < rp->XMin) + { + rp->XMin = span->XMin; + rp->XMinY = span->Y0; + } + if (span->XMax > rp->XMax) + { + rp->XMax = span->XMax; + rp->XMaxY = span->Y0; + } + + span->Increment = 0; + + span->I0 = span->I1 = span->IRecip = 0; + span->Linear = true; + + span->XCovIncr = 0; + + span->IsDummy = true; + + SetupAttrs(span, poly, vertex, vertex); +} + +void 
ComputeRenderer::SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]) +{ + span->X0 = positions[from][0]; + span->X1 = positions[to][0]; + span->Y0 = positions[from][1]; + span->Y1 = positions[to][1]; + + SetupAttrs(span, poly, from, to); + + s32 minXY, maxXY; + bool negative = false; + if (span->X1 > span->X0) + { + span->XMin = span->X0; + span->XMax = span->X1-1; + + minXY = span->Y0; + maxXY = span->Y1; + } + else if (span->X1 < span->X0) + { + span->XMin = span->X1; + span->XMax = span->X0-1; + negative = true; + + minXY = span->Y1; + maxXY = span->Y0; + } + else + { + span->XMin = span->X0; + if (side) span->XMin--; + span->XMax = span->XMin; + + // doesn't matter for completely vertical slope + minXY = span->Y0; + maxXY = span->Y0; + } + + if (span->XMin < rp->XMin) + { + rp->XMin = span->XMin; + rp->XMinY = minXY; + } + if (span->XMax > rp->XMax) + { + rp->XMax = span->XMax; + rp->XMaxY = maxXY; + } + + span->IsDummy = false; + + s32 xlen = span->XMax+1 - span->XMin; + s32 ylen = span->Y1 - span->Y0; + + // slope increment has a 18-bit fractional part + // note: for some reason, x/y isn't calculated directly, + // instead, 1/y is calculated and then multiplied by x + // TODO: this is still not perfect (see for example x=169 y=33) + if (ylen == 0) + { + span->Increment = 0; + } + else if (ylen == xlen) + { + span->Increment = 0x40000; + } + else + { + s32 yrecip = (1<<18) / ylen; + span->Increment = (span->X1-span->X0) * yrecip; + if (span->Increment < 0) span->Increment = -span->Increment; + } + + bool xMajor = (span->Increment > 0x40000); + + if (side) + { + // right + + if (xMajor) + span->DxInitial = negative ? (0x20000 + 0x40000) : (span->Increment - 0x20000); + else if (span->Increment != 0) + span->DxInitial = negative ? 0x40000 : 0; + else + span->DxInitial = -0x40000; + } + else + { + // left + + if (xMajor) + span->DxInitial = negative ? 
((span->Increment - 0x20000) + 0x40000) : 0x20000; + else if (span->Increment != 0) + span->DxInitial = negative ? 0x40000 : 0; + else + span->DxInitial = 0; + } + + if (xMajor) + { + if (side) + { + span->I0 = span->X0 - 1; + span->I1 = span->X1 - 1; + } + else + { + span->I0 = span->X0; + span->I1 = span->X1; + } + + // used for calculating AA coverage + span->XCovIncr = (ylen << 10) / xlen; + } + else + { + span->I0 = span->Y0; + span->I1 = span->Y1; + } + + if (span->I0 != span->I1) + span->IRecip = (1<<30) / (span->I1 - span->I0); + else + span->IRecip = 0; + + span->Linear = (span->W0 == span->W1) && !(span->W0 & 0x7E) && !(span->W1 & 0x7E); + + if ((span->W0 & 0x1) && !(span->W1 & 0x1)) + { + span->W0n = (span->W0 - 1) >> 1; + span->W0d = (span->W0 + 1) >> 1; + span->W1d = span->W1 >> 1; + } + else + { + span->W0n = span->W0 >> 1; + span->W0d = span->W0 >> 1; + span->W1d = span->W1 >> 1; + } +} + +struct Variant +{ + GLuint Texture, Sampler; + u16 Width, Height; + u8 BlendMode; + + bool operator==(const Variant& other) const // compares Texture, Sampler and BlendMode only; Width/Height are not part of the key + { + return Texture == other.Texture && Sampler == other.Sampler && BlendMode == other.BlendMode; + } +}; + +/* + Antialiasing + W-Buffer + With Texture + 0 + 1, 3 + 2 + without Texture + 2 + 0, 1, 3 + + => 20 Shader + 1x Shadow Mask +*/ + +void ComputeRenderer::RenderFrame(GPU& gpu) +{ + if (!Texcache.Update(gpu) && gpu.GPU3D.RenderFrameIdentical) + { + return; + } + + int numYSpans = 0; + int numSetupIndices = 0; + + /* + Some games really like to spam small textures, often + to store the data like PPU tiles. E.g. Shantae + or some Mega Man game. Fortunately they are usually kind + enough to not vary the texture size all too often (usually + they just use 8x8 or 16x for everything). + + This is the reason we have this whole mess where textures of + the same size are put into array textures. This allows + to increase the batch size. + Less variance between each Variant hah! 
+ */ + u32 numVariants = 0, prevVariant = 0, prevTexLayer = 0; // prevTexLayer is only written by GetTexture() but read for every polygon, so give it a defined value + Variant variants[MaxVariants]; + + bool enableTextureMaps = gpu.GPU3D.RenderDispCnt & (1<<0); + + for (int i = 0; i < gpu.GPU3D.RenderNumPolygons; i++) + { + Polygon* polygon = gpu.GPU3D.RenderPolygonRAM[i]; + + u32 nverts = polygon->NumVertices; + u32 vtop = polygon->VTop, vbot = polygon->VBottom; + + u32 curVL = vtop, curVR = vtop; + u32 nextVL, nextVR; + + RenderPolygons[i].FirstXSpan = numSetupIndices; + RenderPolygons[i].Attr = polygon->Attr; + + bool foundVariant = false; + if (i > 0) + { + // if the whole texture attribute matches + // the texture layer will also match + Polygon* prevPolygon = gpu.GPU3D.RenderPolygonRAM[i - 1]; + foundVariant = prevPolygon->TexParam == polygon->TexParam + && prevPolygon->TexPalette == polygon->TexPalette + && (prevPolygon->Attr & 0x30) == (polygon->Attr & 0x30) + && prevPolygon->IsShadowMask == polygon->IsShadowMask; + } + + if (!foundVariant) + { + Variant variant; + variant.BlendMode = polygon->IsShadowMask ? 4 : ((polygon->Attr >> 4) & 0x3); + variant.Texture = 0; + variant.Sampler = 0; + u32* textureLastVariant = nullptr; + // we always need to look up the texture to get the layer of the array texture + if (enableTextureMaps && (polygon->TexParam >> 26) & 0x7) + { + Texcache.GetTexture(gpu, polygon->TexParam, polygon->TexPalette, variant.Texture, prevTexLayer, textureLastVariant); + bool wrapS = (polygon->TexParam >> 16) & 1; + bool wrapT = (polygon->TexParam >> 17) & 1; + bool mirrorS = (polygon->TexParam >> 18) & 1; + bool mirrorT = (polygon->TexParam >> 19) & 1; + variant.Sampler = Samplers[(wrapS ? (mirrorS ? 2 : 1) : 0) + (wrapT ? (mirrorT ? 
2 : 1) : 0) * 3]; + + if (*textureLastVariant < numVariants && variants[*textureLastVariant] == variant) + { + foundVariant = true; + prevVariant = *textureLastVariant; + } + } + + if (!foundVariant) + { + for (int j = numVariants - 1; j >= 0; j--) + { + if (variants[j] == variant) + { + foundVariant = true; + prevVariant = j; + goto foundVariant; + } + } + + assert(numVariants < MaxVariants); // check capacity before the new slot is written, not after + prevVariant = numVariants; + variants[numVariants] = variant; + variants[numVariants].Width = TextureWidth(polygon->TexParam); + variants[numVariants].Height = TextureHeight(polygon->TexParam); + numVariants++; + foundVariant:; + + if (textureLastVariant) + *textureLastVariant = prevVariant; + } + } + RenderPolygons[i].Variant = prevVariant; + RenderPolygons[i].TextureLayer = (float)prevTexLayer; + + if (polygon->FacingView) + { + nextVL = curVL + 1; + if (nextVL >= nverts) nextVL = 0; + nextVR = curVR - 1; + if ((s32)nextVR < 0) nextVR = nverts - 1; + } + else + { + nextVL = curVL - 1; + if ((s32)nextVL < 0) nextVL = nverts - 1; + nextVR = curVR + 1; + if (nextVR >= nverts) nextVR = 0; + } + + s32 scaledPositions[10][2]; + s32 ytop = ScreenHeight, ybot = 0; + for (int i = 0; i < polygon->NumVertices; i++) + { + if (HiresCoordinates) + { + scaledPositions[i][0] = (polygon->Vertices[i]->HiresPosition[0] * ScaleFactor) >> 4; + scaledPositions[i][1] = (polygon->Vertices[i]->HiresPosition[1] * ScaleFactor) >> 4; + } + else + { + scaledPositions[i][0] = polygon->Vertices[i]->FinalPosition[0] * ScaleFactor; + scaledPositions[i][1] = polygon->Vertices[i]->FinalPosition[1] * ScaleFactor; + } + ytop = std::min(scaledPositions[i][1], ytop); + ybot = std::max(scaledPositions[i][1], ybot); + } + RenderPolygons[i].YTop = ytop; + RenderPolygons[i].YBot = ybot; + RenderPolygons[i].XMin = ScreenWidth; + RenderPolygons[i].XMax = 0; + + if (ybot == ytop) + { + vtop = 0; vbot = 0; + + RenderPolygons[i].YBot++; + + int j = 1; + if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j; + 
if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j; + + j = nverts - 1; + if (scaledPositions[j][0] < scaledPositions[vtop][0]) vtop = j; + if (scaledPositions[j][0] > scaledPositions[vbot][0]) vbot = j; + + assert(numYSpans < MaxYSpanSetups); + u32 curSpanL = numYSpans; + SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vtop, 0, scaledPositions); + assert(numYSpans < MaxYSpanSetups); + u32 curSpanR = numYSpans; + SetupYSpanDummy(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, vbot, 1, scaledPositions); + + YSpanIndices[numSetupIndices].PolyIdx = i; + YSpanIndices[numSetupIndices].SpanIdxL = curSpanL; + YSpanIndices[numSetupIndices].SpanIdxR = curSpanR; + YSpanIndices[numSetupIndices].Y = ytop; + numSetupIndices++; + } + else + { + u32 curSpanL = numYSpans; + assert(numYSpans < MaxYSpanSetups); + SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions); + u32 curSpanR = numYSpans; + assert(numYSpans < MaxYSpanSetups); + SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions); + + for (u32 y = ytop; y < ybot; y++) + { + if (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom) + { + while (y >= scaledPositions[nextVL][1] && curVL != polygon->VBottom) + { + curVL = nextVL; + if (polygon->FacingView) + { + nextVL = curVL + 1; + if (nextVL >= nverts) + nextVL = 0; + } + else + { + nextVL = curVL - 1; + if ((s32)nextVL < 0) + nextVL = nverts - 1; + } + } + + + assert(numYSpans < MaxYSpanSetups); + curSpanL = numYSpans; + SetupYSpan(&RenderPolygons[i], &YSpanSetups[numYSpans++], polygon, curVL, nextVL, 0, scaledPositions); + } + if (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom) + { + while (y >= scaledPositions[nextVR][1] && curVR != polygon->VBottom) + { + curVR = nextVR; + if (polygon->FacingView) + { + nextVR = curVR - 1; + if ((s32)nextVR < 0) + nextVR = nverts - 1; + } + else + { + nextVR = curVR + 1; + if 
(nextVR >= nverts) + nextVR = 0; + } + } + + assert(numYSpans < MaxYSpanSetups); + curSpanR = numYSpans; + SetupYSpan(&RenderPolygons[i] ,&YSpanSetups[numYSpans++], polygon, curVR, nextVR, 1, scaledPositions); + } + + YSpanIndices[numSetupIndices].PolyIdx = i; + YSpanIndices[numSetupIndices].SpanIdxL = curSpanL; + YSpanIndices[numSetupIndices].SpanIdxR = curSpanR; + YSpanIndices[numSetupIndices].Y = y; + numSetupIndices++; + } + } + + //printf("polygon min max %d %d | %d %d\n", RenderPolygons[i].XMin, RenderPolygons[i].XMinY, RenderPolygons[i].XMax, RenderPolygons[i].XMaxY); + } + + /*for (u32 i = 0; i < RenderNumPolygons; i++) + { + if (RenderPolygons[i].Variant >= numVariants) + { + printf("blarb2 %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons); + } + //assert(RenderPolygons[i].Variant < numVariants); + }*/ + + if (numYSpans > 0) + { + glBindBuffer(GL_SHADER_STORAGE_BUFFER, YSpanSetupMemory); + glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeof(SpanSetupY)*numYSpans, YSpanSetups); + + glBindBuffer(GL_TEXTURE_BUFFER, YSpanIndicesTextureMemory); + glBufferSubData(GL_TEXTURE_BUFFER, 0, numSetupIndices*4*2, YSpanIndices.data()); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, RenderPolygonMemory); + glBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, gpu.GPU3D.RenderNumPolygons*sizeof(RenderPolygon), RenderPolygons); + // we haven't accessed image data yet, so we don't need to invalidate anything + } + + //printf("found via %d %d %d of %d\n", foundviatexcache, foundviaprev, numslow, RenderNumPolygons); + + // bind everything + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, RenderPolygonMemory); + + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, XSpanSetupMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, YSpanSetupMemory); + + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, FinalTileMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, BinResultMemory); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 7, WorkDescMemory); + + MetaUniform meta; + meta.DispCnt = 
gpu.GPU3D.RenderDispCnt; + meta.NumPolygons = gpu.GPU3D.RenderNumPolygons; + meta.NumVariants = numVariants; + meta.AlphaRef = gpu.GPU3D.RenderAlphaRef; + { + u32 r = (gpu.GPU3D.RenderClearAttr1 << 1) & 0x3E; if (r) r++; + u32 g = (gpu.GPU3D.RenderClearAttr1 >> 4) & 0x3E; if (g) g++; + u32 b = (gpu.GPU3D.RenderClearAttr1 >> 9) & 0x3E; if (b) b++; + u32 a = (gpu.GPU3D.RenderClearAttr1 >> 16) & 0x1F; + meta.ClearColor = r | (g << 8) | (b << 16) | (a << 24); + meta.ClearDepth = ((gpu.GPU3D.RenderClearAttr2 & 0x7FFF) * 0x200) + 0x1FF; + meta.ClearAttr = gpu.GPU3D.RenderClearAttr1 & 0x3F008000; + } + for (u32 i = 0; i < 32; i++) + { + u32 color = gpu.GPU3D.RenderToonTable[i]; + u32 r = (color << 1) & 0x3E; + u32 g = (color >> 4) & 0x3E; + u32 b = (color >> 9) & 0x3E; + if (r) r++; + if (g) g++; + if (b) b++; + + meta.ToonTable[i*4+0] = r | (g << 8) | (b << 16); + } + for (u32 i = 0; i < 34; i++) + { + meta.ToonTable[i*4+1] = gpu.GPU3D.RenderFogDensityTable[i]; + } + for (u32 i = 0; i < 8; i++) + { + u32 color = gpu.GPU3D.RenderEdgeTable[i]; + u32 r = (color << 1) & 0x3E; + u32 g = (color >> 4) & 0x3E; + u32 b = (color >> 9) & 0x3E; + if (r) r++; + if (g) g++; + if (b) b++; + + meta.ToonTable[i*4+2] = r | (g << 8) | (b << 16); + } + meta.FogOffset = gpu.GPU3D.RenderFogOffset; + meta.FogShift = gpu.GPU3D.RenderFogShift; + { + u32 fogR = (gpu.GPU3D.RenderFogColor << 1) & 0x3E; if (fogR) fogR++; + u32 fogG = (gpu.GPU3D.RenderFogColor >> 4) & 0x3E; if (fogG) fogG++; + u32 fogB = (gpu.GPU3D.RenderFogColor >> 9) & 0x3E; if (fogB) fogB++; + u32 fogA = (gpu.GPU3D.RenderFogColor >> 16) & 0x1F; + meta.FogColor = fogR | (fogG << 8) | (fogB << 16) | (fogA << 24); + } + + glBindBuffer(GL_UNIFORM_BUFFER, MetaUniformMemory); + glBufferSubData(GL_UNIFORM_BUFFER, 0, sizeof(MetaUniform), &meta); + glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory); + + glUseProgram(ShaderClearCoarseBinMask); + glDispatchCompute(TilesPerLine*TileLines/32, 1, 1); + + bool wbuffer = false; + if 
(numYSpans > 0) + { + wbuffer = gpu.GPU3D.RenderPolygonRAM[0]->WBuffer; + + glUseProgram(ShaderClearIndirectWorkCount); + glDispatchCompute((numVariants+31)/32, 1, 1); + + // calculate x-spans + glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI); + glUseProgram(ShaderInterpXSpans[wbuffer]); + glDispatchCompute((numSetupIndices + 31) / 32, 1, 1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); // glMemoryBarrier takes barrier bits, not the GL_SHADER_STORAGE_BUFFER target enum + + // bin polygons + glUseProgram(ShaderBinCombined); + glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + // calculate list offsets + glUseProgram(ShaderCalculateWorkListOffset); + glDispatchCompute((numVariants + 31) / 32, 1, 1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_COMMAND_BARRIER_BIT); // command bit: the indirect dispatch below reads its group counts from BinResultMemory + + // sort shader work + glUseProgram(ShaderSortWork); + glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); + glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount)); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_COMMAND_BARRIER_BIT); // command bit: the rasterise passes also dispatch indirectly from BinResultMemory + + glActiveTexture(GL_TEXTURE0); + + for (int i = 0; i < tilememoryLayer_Num; i++) + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2+i, TileMemory[i]); + + // rasterise + { + bool highLightMode = gpu.GPU3D.RenderDispCnt & (1<<1); + + GLuint shadersNoTexture[] = + { + ShaderRasteriseNoTexture[wbuffer], + ShaderRasteriseNoTexture[wbuffer], + highLightMode + ? ShaderRasteriseNoTextureHighlight[wbuffer] + : ShaderRasteriseNoTextureToon[wbuffer], + ShaderRasteriseNoTexture[wbuffer], + ShaderRasteriseShadowMask[wbuffer] + }; + GLuint shadersUseTexture[] = + { + ShaderRasteriseUseTextureModulate[wbuffer], + ShaderRasteriseUseTextureDecal[wbuffer], + highLightMode + ? 
ShaderRasteriseUseTextureHighlight[wbuffer] + : ShaderRasteriseUseTextureToon[wbuffer], + ShaderRasteriseUseTextureDecal[wbuffer], + ShaderRasteriseShadowMask[wbuffer] + }; + + GLuint prevShader = 0; + s32 prevTexture = 0, prevSampler = 0; + for (int i = 0; i < numVariants; i++) + { + GLuint shader = 0; + if (variants[i].Texture == 0) + { + shader = shadersNoTexture[variants[i].BlendMode]; + } + else + { + shader = shadersUseTexture[variants[i].BlendMode]; + if (variants[i].Texture != prevTexture) + { + glBindTexture(GL_TEXTURE_2D_ARRAY, variants[i].Texture); + prevTexture = variants[i].Texture; + } + if (variants[i].Sampler != prevSampler) + { + glBindSampler(0, variants[i].Sampler); + prevSampler = variants[i].Sampler; + } + } + assert(shader != 0); + if (shader != prevShader) + { + glUseProgram(shader); + prevShader = shader; + } + + glUniform1ui(UniformIdxCurVariant, i); + glUniform2f(UniformIdxTextureSize, 1.f / variants[i].Width, 1.f / variants[i].Height); + glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory); + glDispatchComputeIndirect(offsetof(BinResultHeader, VariantWorkCount) + i*4*4); + } + } + } + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + // compose final image + glUseProgram(ShaderDepthBlend[wbuffer]); + glDispatchCompute(ScreenWidth/TileSize, ScreenHeight/TileSize, 1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + glBindImageTexture(0, Framebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8); + glBindImageTexture(1, LowResFramebuffer, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI); + u32 finalPassShader = 0; + if (gpu.GPU3D.RenderDispCnt & (1<<4)) + finalPassShader |= 0x4; + if (gpu.GPU3D.RenderDispCnt & (1<<7)) + finalPassShader |= 0x2; + if (gpu.GPU3D.RenderDispCnt & (1<<5)) + finalPassShader |= 0x1; + + glUseProgram(ShaderFinalPass[finalPassShader]); + glDispatchCompute(ScreenWidth/32, ScreenHeight, 1); + glMemoryBarrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); + + glBindSampler(0, 0); + + /*u64 starttime = armGetSystemTick(); + 
EmuQueue.waitIdle(); + printf("total time %f\n", armTicksToNs(armGetSystemTick()-starttime)*0.000001f);*/ + + /*for (u32 i = 0; i < RenderNumPolygons; i++) + { + if (RenderPolygons[i].Variant >= numVariants) + { + printf("blarb %d %d %d\n", RenderPolygons[i].Variant, i, RenderNumPolygons); + } + //assert(RenderPolygons[i].Variant < numVariants); + }*/ + + /*for (int i = 0; i < binresult->SortWorkWorkCount[0]*32; i++) + { + printf("sorted %x %x\n", binresult->SortedWork[i*2+0], binresult->SortedWork[i*2+1]); + }*/ +/* if (polygonvisible != -1) + { + SpanSetupX* xspans = Gfx::DataHeap->CpuAddr(XSpanSetupMemory); + printf("span result\n"); + Polygon* poly = RenderPolygonRAM[polygonvisible]; + u32 xspanoffset = RenderPolygons[polygonvisible].FirstXSpan; + for (u32 i = 0; i < (poly->YBottom - poly->YTop); i++) + { + printf("%d: %d - %d | %d %d | %d %d\n", i + poly->YTop, xspans[xspanoffset + i].X0, xspans[xspanoffset + i].X1, xspans[xspanoffset + i].__pad0, xspans[xspanoffset + i].__pad1, RenderPolygons[polygonvisible].YTop, RenderPolygons[polygonvisible].YBot); + } + }*/ +/* + printf("xspans: %d\n", numSetupIndices); + SpanSetupX* xspans = Gfx::DataHeap->CpuAddr(XSpanSetupMemory[curSlice]); + for (int i = 0; i < numSetupIndices; i++) + { + printf("poly %d %d %d | line %d | %d to %d\n", YSpanIndices[i].PolyIdx, YSpanIndices[i].SpanIdxL, YSpanIndices[i].SpanIdxR, YSpanIndices[i].Y, xspans[i].X0, xspans[i].X1); + } + printf("bin result\n"); + BinResult* binresult = Gfx::DataHeap->CpuAddr(BinResultMemory); + for (u32 y = 0; y < 192/8; y++) + { + for (u32 x = 0; x < 256/8; x++) + { + printf("%08x ", binresult->BinnedMaskCoarse[(x + y * (256/8)) * 2]); + } + printf("\n"); + }*/ +} + +void ComputeRenderer::RestartFrame(GPU& gpu) +{ + +} + +u32* ComputeRenderer::GetLine(int line) +{ + int stride = 256; + + if (line == 0) + { + glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer); + u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); + if (data) 
memcpy(&FramebufferCPU[0], data, 4*stride*192); + glUnmapBuffer(GL_PIXEL_PACK_BUFFER); + } + + return &FramebufferCPU[stride * line]; +} + +void ComputeRenderer::SetupAccelFrame() +{ + glBindTexture(GL_TEXTURE_2D, Framebuffer); +} + +void ComputeRenderer::PrepareCaptureFrame() +{ + glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelBuffer); + glBindTexture(GL_TEXTURE_2D, LowResFramebuffer); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, nullptr); +} + +void ComputeRenderer::BindOutputTexture(int buffer) +{ + CurGLCompositor.BindOutputTexture(buffer); +} + +void ComputeRenderer::Blit(const GPU &gpu) +{ + CurGLCompositor.RenderFrame(gpu, *this); +} + +void ComputeRenderer::Stop(const GPU &gpu) +{ + CurGLCompositor.Stop(gpu); +} + +} \ No newline at end of file diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h new file mode 100644 index 0000000000..7544c09e0f --- /dev/null +++ b/src/GPU3D_Compute.h @@ -0,0 +1,242 @@ +/* + Copyright 2016-2022 melonDS team + + This file is part of melonDS. + + melonDS is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation, either version 3 of the License, or (at your option) + any later version. + + melonDS is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with melonDS. If not, see http://www.gnu.org/licenses/. 
+*/ + +#ifndef GPU3D_COMPUTE +#define GPU3D_COMPUTE + +#include + +#include "types.h" + +#include "GPU3D.h" + +#include "OpenGLSupport.h" +#include "GPU_OpenGL.h" + +#include "GPU3D_TexcacheOpenGL.h" + +#include "NonStupidBitfield.h" + +namespace melonDS +{ + +class ComputeRenderer : public Renderer3D +{ +public: + static std::unique_ptr New(); + ~ComputeRenderer() override; + + void Reset(GPU& gpu) override; + + void SetRenderSettings(int scale, bool highResolutionCoordinates); + + void VCount144(GPU& gpu) override; + + void RenderFrame(GPU& gpu) override; + void RestartFrame(GPU& gpu) override; + u32* GetLine(int line) override; + + void SetupAccelFrame() override; + void PrepareCaptureFrame() override; + + void BindOutputTexture(int buffer) override; + + void Blit(const GPU& gpu) override; + void Stop(const GPU& gpu) override; + + bool NeedsShaderCompile() { return ShaderStepIdx != 33; } + void ShaderCompileStep(int& current, int& count) override; +private: + ComputeRenderer(GLCompositor&& compositor); + + GLuint ShaderInterpXSpans[2]; + GLuint ShaderBinCombined; + GLuint ShaderDepthBlend[2]; + GLuint ShaderRasteriseNoTexture[2]; + GLuint ShaderRasteriseNoTextureToon[2]; + GLuint ShaderRasteriseNoTextureHighlight[2]; + GLuint ShaderRasteriseUseTextureDecal[2]; + GLuint ShaderRasteriseUseTextureModulate[2]; + GLuint ShaderRasteriseUseTextureToon[2]; + GLuint ShaderRasteriseUseTextureHighlight[2]; + GLuint ShaderRasteriseShadowMask[2]; + GLuint ShaderClearCoarseBinMask; + GLuint ShaderClearIndirectWorkCount; + GLuint ShaderCalculateWorkListOffset; + GLuint ShaderSortWork; + GLuint ShaderFinalPass[8]; + + GLuint YSpanIndicesTextureMemory; + GLuint YSpanIndicesTexture; + GLuint YSpanSetupMemory; + GLuint XSpanSetupMemory; + GLuint BinResultMemory; + GLuint RenderPolygonMemory; + GLuint WorkDescMemory; + + enum + { + tilememoryLayer_Color, + tilememoryLayer_Depth, + tilememoryLayer_Attr, + tilememoryLayer_Num, + }; + + GLuint TileMemory[tilememoryLayer_Num]; + GLuint 
FinalTileMemory; + + u32 DummyLine[256] = {}; + + struct SpanSetupY + { + // Attributes + s32 Z0, Z1, W0, W1; + s32 ColorR0, ColorG0, ColorB0; + s32 ColorR1, ColorG1, ColorB1; + s32 TexcoordU0, TexcoordV0; + s32 TexcoordU1, TexcoordV1; + + // Interpolator + s32 I0, I1; + s32 Linear; + s32 IRecip; + s32 W0n, W0d, W1d; + + // Slope + s32 Increment; + + s32 X0, X1, Y0, Y1; + s32 XMin, XMax; + s32 DxInitial; + + s32 XCovIncr; + u32 IsDummy; + }; + struct SpanSetupX + { + s32 X0, X1; + + s32 EdgeLenL, EdgeLenR, EdgeCovL, EdgeCovR; + + s32 XRecip; + + u32 Flags; + + s32 Z0, Z1, W0, W1; + s32 ColorR0, ColorG0, ColorB0; + s32 ColorR1, ColorG1, ColorB1; + s32 TexcoordU0, TexcoordV0; + s32 TexcoordU1, TexcoordV1; + + s32 CovLInitial, CovRInitial; + }; + struct SetupIndices + { + u16 PolyIdx, SpanIdxL, SpanIdxR, Y; + }; + struct RenderPolygon + { + u32 FirstXSpan; + s32 YTop, YBot; + + s32 XMin, XMax; + s32 XMinY, XMaxY; + + u32 Variant; + u32 Attr; + + float TextureLayer; + }; + + static constexpr int TileSize = 8; + static constexpr int CoarseTileCountX = 8; + static constexpr int CoarseTileCountY = 4; + static constexpr int CoarseTileW = CoarseTileCountX * TileSize; + static constexpr int CoarseTileH = CoarseTileCountY * TileSize; + + static constexpr int BinStride = 2048/32; + static constexpr int CoarseBinStride = BinStride/32; + + static constexpr int MaxVariants = 256; + + static constexpr int UniformIdxCurVariant = 0; + static constexpr int UniformIdxTextureSize = 1; + + static constexpr int MaxFullscreenLayers = 16; + + struct BinResultHeader + { + u32 VariantWorkCount[MaxVariants*4]; + u32 SortedWorkOffset[MaxVariants]; + + u32 SortWorkWorkCount[4]; + }; + + static const int MaxYSpanSetups = 6144*2; + std::vector YSpanIndices; + SpanSetupY YSpanSetups[MaxYSpanSetups]; + RenderPolygon RenderPolygons[2048]; + + TexcacheOpenGL Texcache; + + struct MetaUniform + { + u32 NumPolygons; + u32 NumVariants; + + u32 AlphaRef; + u32 DispCnt; + + u32 ToonTable[4*34]; + + u32 
ClearColor, ClearDepth, ClearAttr; + + u32 FogOffset, FogShift, FogColor; + }; + GLuint MetaUniformMemory; + + GLuint Samplers[9]; + + GLuint Framebuffer = 0; + GLuint LowResFramebuffer; + GLuint PixelBuffer; + + u32 FramebufferCPU[256*192]; + + int ScreenWidth, ScreenHeight; + int TilesPerLine, TileLines; + int ScaleFactor = -1; + int MaxWorkTiles; + bool HiresCoordinates; + + GLCompositor CurGLCompositor; + + int ShaderStepIdx = 0; + + void DeleteShaders(); + + void SetupAttrs(SpanSetupY* span, Polygon* poly, int from, int to); + void SetupYSpan(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int from, int to, int side, s32 positions[10][2]); + void SetupYSpanDummy(RenderPolygon* rp, SpanSetupY* span, Polygon* poly, int vertex, int side, s32 positions[10][2]); + + bool CompileShader(GLuint& shader, const std::string& source, const std::initializer_list& defines); +}; + +} + +#endif \ No newline at end of file diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h new file mode 100644 index 0000000000..d365cf3100 --- /dev/null +++ b/src/GPU3D_Compute_shaders.h @@ -0,0 +1,1665 @@ +/* + Copyright 2016-2022 melonDS team + + This file is part of melonDS. + + melonDS is free software: you can redistribute it and/or modify it under + the terms of the GNU General Public License as published by the Free + Software Foundation, either version 3 of the License, or (at your option) + any later version. + + melonDS is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with melonDS. If not, see http://www.gnu.org/licenses/. 
+*/ + +#ifndef GPU3D_COMPUTE_SHADERS +#define GPU3D_COMPUTE_SHADERS + +#include + +namespace melonDS +{ + +namespace ComputeRendererShaders +{ + +// defines: +// InterpSpans +// BinCombined +// Rasterise +// DepthBlend +// ClearCoarseBinMask +// ClearIndirectWorkCount +// CalculateWorkOffsets +// SortWork +// FinalPass + +// AntiAliasing +// EdgeMarking +// Fog + +// ZBuffer +// WBuffer + +// for Rasterise +// NoTexture +// UseTexture +// Decal +// Modulate +// Toon +// Highlight +// ShadowMask + + +/* + Some notes on signed division: + + we want to avoid it, so we can avoid higher precision numbers + in a few places. + + Fortunately all divisions *should* assuming I'm not mistaken + have the same sign on the divisor and the dividend. + + Thus we apply: + + assuming n < 0 <=> d < 0 + n/d = abs(n)/abs(d) + +*/ + +const std::string XSpanSetupBuffer{R"( + +const uint XSpanSetup_Linear = 1U << 0; +const uint XSpanSetup_FillInside = 1U << 1; +const uint XSpanSetup_FillLeft = 1U << 2; +const uint XSpanSetup_FillRight = 1U << 3; + +struct XSpanSetup +{ + int X0, X1; + + int InsideStart, InsideEnd, EdgeCovL, EdgeCovR; + + int XRecip; + + uint Flags; + + int Z0, Z1, W0, W1; + int ColorR0, ColorG0, ColorB0; + int ColorR1, ColorG1, ColorB1; + int TexcoordU0, TexcoordV0; + int TexcoordU1, TexcoordV1; + + int CovLInitial, CovRInitial; +}; + +#if defined(Rasterise) +int CalcYFactorX(XSpanSetup span, int x) +{ + x -= span.X0; + + if (span.X0 != span.X1) + { + uint numLo = uint(x) * uint(span.W0); + uint numHi = 0U; + numHi |= numLo >> (32U-YFactorShift); + numLo <<= YFactorShift; + + uint den = uint(x) * uint(span.W0) + uint(span.X1 - span.X0 - x) * uint(span.W1); + + if (den == 0) + return 0; + else + return int(Div64_32_32(numHi, numLo, den)); + } + else + { + return 0; + } +} +#endif + +layout (std430, binding = 1) buffer XSpanSetupsBuffer +{ + XSpanSetup XSpanSetups[]; +}; + +)"}; + +const std::string YSpanSetupBuffer{R"( + +struct YSpanSetup +{ + // Attributes + int Z0, Z1, 
W0, W1; + int ColorR0, ColorG0, ColorB0; + int ColorR1, ColorG1, ColorB1; + int TexcoordU0, TexcoordV0; + int TexcoordU1, TexcoordV1; + + // Interpolator + int I0, I1; + bool Linear; + int IRecip; + int W0n, W0d, W1d; + + // Slope + int Increment; + + int X0, X1, Y0, Y1; + int XMin, XMax; + int DxInitial; + + int XCovIncr; + + bool IsDummy; +}; + +#if defined(InterpSpans) +int CalcYFactorY(YSpanSetup span, int i) +{ + /* + maybe it would be better to do use a 32x32=64 multiplication? + */ + uint numLo = uint(abs(i)) * uint(span.W0n); + uint numHi = 0U; + numHi |= numLo >> (32U-YFactorShift); + numLo <<= YFactorShift; + + uint den = uint(abs(i)) * uint(span.W0d) + uint(abs(span.I1 - span.I0 - i)) * span.W1d; + + if (den == 0) + { + return 0; + } + else + { + return int(Div64_32_32(numHi, numLo, den)); + } +} + +int CalculateDx(int y, YSpanSetup span) +{ + return span.DxInitial + (y - span.Y0) * span.Increment; +} + +int CalculateX(int dx, YSpanSetup span) +{ + int x = span.X0; + if (span.X1 < span.X0) + x -= dx >> 18; + else + x += dx >> 18; + return clamp(x, span.XMin, span.XMax); +} + +void EdgeParams_XMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov) +{ + bool negative = span.X1 < span.X0; + int len; + if (side != negative) + len = (dx >> 18) - ((dx-span.Increment) >> 18); + else + len = ((dx+span.Increment) >> 18) - (dx >> 18); + edgelen = len; + + int xlen = span.XMax + 1 - span.XMin; + int startx = dx >> 18; + if (negative) startx = xlen - startx; + if (side) startx = startx - len + 1; + + uint r; + int startcov = int(Div(uint(((startx << 10) + 0x1FF) * (span.Y1 - span.Y0)), uint(xlen), r)); + edgecov = (1<<31) | ((startcov & 0x3FF) << 12) | (span.XCovIncr & 0x3FF); +} + +void EdgeParams_YMajor(bool side, int dx, YSpanSetup span, out int edgelen, out int edgecov) +{ + bool negative = span.X1 < span.X0; + edgelen = 1; + + if (span.Increment == 0) + { + edgecov = 31; + } + else + { + int cov = ((dx >> 9) + (span.Increment >> 10)) >> 4; 
+ if ((cov >> 5) != (dx >> 18)) cov = 31; + cov &= 0x1F; + if (side == negative) cov = 0x1F - cov; + + edgecov = cov; + } +} +#endif + +layout (std430, binding = 2) buffer YSpanSetupsBuffer +{ + YSpanSetup YSpanSetups[]; +}; + +)"}; + +const std::string PolygonBuffer{R"( +struct Polygon +{ + int FirstXSpan; + int YTop, YBot; + + int XMin, XMax; + int XMinY, XMaxY; + + int Variant; + + uint Attr; + + float TextureLayer; +}; + +layout (std430, binding = 0) readonly buffer PolygonBuffer +{ + Polygon Polygons[]; +}; +)"}; + +const std::string BinningBuffer{R"( + +layout (std430, binding = 6) buffer BinResultBuffer +{ + uvec4 VariantWorkCount[MaxVariants]; + uint SortedWorkOffset[MaxVariants]; + + uvec4 SortWorkWorkCount; + + uint BinningMaskAndOffset[]; + //uint BinnedMaskCoarse[TilesPerLine*TileLines*CoarseBinStride]; + //uint BinnedMask[TilesPerLine*TileLines*BinStride]; + //uint WorkOffsets[TilesPerLine*TileLines*BinStride]; +}; + +const int BinningCoarseMaskStart = 0; +const int BinningMaskStart = BinningCoarseMaskStart+TilesPerLine*TileLines*CoarseBinStride; +const int BinningWorkOffsetsStart = BinningMaskStart+TilesPerLine*TileLines*BinStride; + +)"}; + +/* + structure of each WorkDesc item: + x: + bits 0-10: polygon idx + bits 11-31: tile idx (before sorting within variant after sorting within all tiles) + y: + bits 0-15: X position on screen + bits 15-31: Y position on screen +*/ +const std::string WorkDescBuffer{R"( +layout (std430, binding = 7) buffer WorkDescBuffer +{ + //uvec2 UnsortedWorkDescs[MaxWorkTiles]; + //uvec2 SortedWorkDescs[MaxWorkTiles]; + uvec2 WorkDescs[]; +}; + +const uint WorkDescsUnsortedStart = 0; +const uint WorkDescsSortedStart = WorkDescsUnsortedStart+MaxWorkTiles; + +)"}; + +const std::string Tilebuffers{R"( +layout (std430, binding = 2) buffer ColorTileBuffer +{ + uint ColorTiles[]; +}; +layout (std430, binding = 3) buffer DepthTileBuffer +{ + uint DepthTiles[]; +}; +layout (std430, binding = 4) buffer AttrTileBuffer +{ + uint 
AttrTiles[]; +}; + +)"}; + +const std::string ResultBuffer{R"( +layout (std430, binding = 5) buffer ResultBuffer +{ + uint ResultValue[]; +}; + +const uint ResultColorStart = 0; +const uint ResultDepthStart = ResultColorStart+ScreenWidth*ScreenHeight*2; +const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2; +)"}; + +const char* Common = R"( + +#define TileSize 8 +const int CoarseTileCountX = 8; +const int CoarseTileCountY = 4; +const int CoarseTileW = (CoarseTileCountX * TileSize); +const int CoarseTileH = (CoarseTileCountY * TileSize); + +const int FramebufferStride = ScreenWidth*ScreenHeight; +const int TilesPerLine = ScreenWidth/TileSize; +const int TileLines = ScreenHeight/TileSize; + +const int BinStride = 2048/32; +const int CoarseBinStride = BinStride/32; + +const int MaxVariants = 256; + +layout (std140, binding = 0) uniform MetaUniform +{ + uint NumPolygons; + uint NumVariants; + + int AlphaRef; + + uint DispCnt; + + // r = Toon + // g = Fog Density + // b = Edge Color + uvec4 ToonTable[34]; + + uint ClearColor, ClearDepth, ClearAttr; + + uint FogOffset, FogShift, FogColor; +}; + +#ifdef InterpSpans +const int YFactorShift = 9; +#else +const int YFactorShift = 8; +#endif + +#if defined(InterpSpans) || defined(Rasterise) +uint Umulh(uint a, uint b) +{ + uint lo, hi; + umulExtended(a, b, hi, lo); + return hi; +} + +const uint startTable[256] = uint[256]( + 254, 252, 250, 248, 246, 244, 242, 240, 238, 236, 234, 233, 231, 229, 227, 225, 224, 222, 220, 218, 217, 215, 213, 212, 210, 208, 207, 205, 203, 202, 200, 199, 197, 195, 194, 192, 191, 189, 188, 186, 185, 183, 182, 180, 179, 178, 176, 175, 173, 172, 170, 169, 168, 166, 165, 164, 162, 161, 160, 158, +157, 156, 154, 153, 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134, 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100, 99, 98, 97, 96, 95, 94, 93, 92, 
91, 90, 89, 88, 88, 87, 86, 85, 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71, 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59, 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47, 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36, 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0 +); + +uint Div(uint x, uint y, out uint r) +{ + // https://www.microsoft.com/en-us/research/publication/software-integer-division/ + uint k = 31 - findMSB(y); + uint ty = (y << k) >> (32 - 9); + uint t = startTable[ty - 256] + 256; + uint z = (t << (32 - 9)) >> (32 - k - 1); + uint my = 0 - y; + + z += Umulh(z, my * z); + z += Umulh(z, my * z); + + uint q = Umulh(x, z); + r = x - y * q; + if(r >= y) + { + r = r - y; + q = q + 1; + if(r >= y) + { + r = r - y; + q = q + 1; + } + } + + return q; +} + +uint Div64_32_32(uint numHi, uint numLo, uint den) +{ + // based on https://github.com/ridiculousfish/libdivide/blob/3bd34388573681ce563348cdf04fe15d24770d04/libdivide.h#L469 + // modified to work with half the size 64/32=32 instead of 128/64=64 + // for further details see https://ridiculousfish.com/blog/posts/labor-of-division-episode-iv.html + + // We work in base 2**16. + // A uint32 holds a single digit (in the lower 16 bit). A uint32 holds two digits. + // Our numerator is conceptually [num3, num2, num1, num0]. + // Our denominator is [den1, den0]. + const uint b = (1U << 16); + + // Determine the normalization factor. We multiply den by this, so that its leading digit is at + // least half b. In binary this means just shifting left by the number of leading zeros, so that + // there's a 1 in the MSB. + // We also shift numer by the same amount. This cannot overflow because numHi < den. 
+ // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting + // by 64. (it's also UB in GLSL!!!!) + uint shift = 31 - findMSB(den); + den <<= shift; + numHi <<= shift; + numHi |= (numLo >> (-shift & 31U)) & uint(-int(shift) >> 31); + numLo <<= shift; + + // Extract the low digits of the numerator and both digits of the denominator. + uint num1 = (numLo >> 16); + uint num0 = (numLo & 0xFFFFU); + uint den1 = (den >> 16); + uint den0 = (den & 0xFFFFU); + + // We wish to compute q1 = [n3 n2 n1] / [d1 d0]. + // Estimate q1 as [n3 n2] / [d1], and then correct it. + // Note while qhat may be 2 digits, q1 is always 1 digit. + + uint rhat; + uint qhat = Div(numHi, den1, rhat); + uint c1 = qhat * den0; + uint c2 = rhat * b + num1; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1; + uint q1 = qhat & 0xFFFFU; + + // Compute the true (partial) remainder. + uint rem = numHi * b + num1 - q1 * den; + + // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0]. + // Estimate q0 as [rem1 rem0] / [d1] and correct it. + qhat = Div(rem, den1, rhat); + c1 = qhat * den0; + c2 = rhat * b + num0; + if (c1 > c2) qhat -= (c1 - c2 > den) ? 
2 : 1; + + return bitfieldInsert(qhat, q1, 16, 16); +} + +int InterpolateAttrPersp(int y0, int y1, int ifactor) +{ + if (y0 == y1) + return y0; + + if (y0 < y1) + return y0 + (((y1-y0) * ifactor) >> YFactorShift); + else + return y1 + (((y0-y1) * ((1<> YFactorShift); +} + +int InterpolateAttrLinear(int y0, int y1, int i, int irecip, int idiff) +{ + if (y0 == y1) + return y0; + +#ifndef Rasterise + irecip = abs(irecip); +#endif + + uint mulLo, mulHi, carry; + if (y0 < y1) + { +#ifndef Rasterise + uint offset = uint(abs(i)); +#else + uint offset = uint(i); +#endif + umulExtended(uint(y1-y0)*offset, uint(irecip), mulHi, mulLo); + mulLo = uaddCarry(mulLo, 3U<<24, carry); + mulHi += carry; + return y0 + int((mulLo >> 30) | (mulHi << (32 - 30))); + //return y0 + int(((int64_t(y1-y0) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30); + } + else + { +#ifndef Rasterise + uint offset = uint(abs(idiff-i)); +#else + uint offset = uint(idiff-i); +#endif + umulExtended(uint(y0-y1)*offset, uint(irecip), mulHi, mulLo); + mulLo = uaddCarry(mulLo, 3<<24, carry); + mulHi += carry; + return y1 + int((mulLo >> 30) | (mulHi << (32 - 30))); + //return y1 + int(((int64_t(y0-y1) * int64_t(offset) * int64_t(irecip)) + int64_t(3<<24)) >> 30); + } +} + +uint InterpolateZZBuffer(int z0, int z1, int i, int irecip, int idiff) +{ + if (z0 == z1) + return z0; + + uint base, disp, factor; + if (z0 < z1) + { + base = uint(z0); + disp = uint(z1 - z0); + factor = uint(abs(i)); + } + else + { + base = uint(z1); + disp = uint(z0 - z1), + factor = uint(abs(idiff - i)); + } + +#ifdef InterpSpans + int shiftl = 0; + const int shiftr = 22; + if (disp > 0x3FF) + { + shiftl = findMSB(disp) - 9; + disp >>= shiftl; + } +#else + disp >>= 9; + const int shiftl = 0; + const int shiftr = 13; +#endif + uint mulLo, mulHi; + + umulExtended(disp * factor, abs(irecip) >> 8, mulHi, mulLo); + + return base + (((mulLo >> shiftr) | (mulHi << (32 - shiftr))) << shiftl); +/* + int base, disp, factor; + if (z0 < 
z1) + { + base = z0; + disp = z1 - z0; + factor = i; + } + else + { + base = z1; + disp = z0 - z1, + factor = idiff - i; + } + +#ifdef InterpSpans + { + int shift = 0; + while (disp > 0x3FF) + { + disp >>= 1; + shift++; + } + + return base + int(((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 22) << shift); + } +#else + { + disp >>= 9; + return base + int((int64_t(disp) * int64_t(factor) * (int64_t(irecip) >> 8)) >> 13); + } +#endif*/ +} + +uint InterpolateZWBuffer(int z0, int z1, int ifactor) +{ + if (z0 == z1) + return z0; + +#ifdef Rasterise + // since the precision along x spans is only 8 bit the result will always fit in 32-bit + if (z0 < z1) + { + return uint(z0) + (((z1-z0) * ifactor) >> YFactorShift); + } + else + { + return uint(z1) + (((z0-z1) * ((1<> YFactorShift); + } +#else + uint mulLo, mulHi; + if (z0 < z1) + { + umulExtended(z1-z0, ifactor, mulHi, mulLo); + // 64-bit shift + return uint(z0) + ((mulLo >> YFactorShift) | (mulHi << (32-YFactorShift))); + } + else + { + umulExtended(z0-z1, (1<> YFactorShift) | (mulHi << (32-YFactorShift))); + } +#endif + /*if (z0 < z1) + { + return uint(z0) + uint((int64_t(z1-z0) * int64_t(ifactor)) >> YFactorShift); + } + else + { + return uint(z1) + uint((int64_t(z0-z1) * int64_t((1<> YFactorShift); + }*/ +} +#endif + +)"; + +const std::string InterpSpans = + PolygonBuffer + + XSpanSetupBuffer + + YSpanSetupBuffer + R"( +layout (local_size_x = 32) in; + +layout (binding = 0, rgba16ui) uniform readonly uimageBuffer SetupIndices; + +void main() +{ + uvec4 setup = imageLoad(SetupIndices, int(gl_GlobalInvocationID.x)); + + YSpanSetup spanL = YSpanSetups[setup.y]; + YSpanSetup spanR = YSpanSetups[setup.z]; + + XSpanSetup xspan; + xspan.Flags = 0U; + + int y = int(setup.w); + + int dxl = CalculateDx(y, spanL); + int dxr = CalculateDx(y, spanR); + + int xl = CalculateX(dxl, spanL); + int xr = CalculateX(dxr, spanR); + + Polygon polygon = Polygons[setup.x]; + + int edgeLenL, edgeLenR; + + if (xl > xr) + { + 
YSpanSetup tmpSpan = spanL; + spanL = spanR; + spanR = tmpSpan; + + int tmp = xl; + xl = xr; + xr = tmp; + + EdgeParams_YMajor(false, dxr, spanL, edgeLenL, xspan.EdgeCovL); + EdgeParams_YMajor(true, dxl, spanR, edgeLenR, xspan.EdgeCovR); + } + else + { + // edges are the right way + if (spanL.Increment > 0x40000) + EdgeParams_XMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL); + else + EdgeParams_YMajor(false, dxl, spanL, edgeLenL, xspan.EdgeCovL); + if (spanR.Increment > 0x40000) + EdgeParams_XMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR); + else + EdgeParams_YMajor(true, dxr, spanR, edgeLenR, xspan.EdgeCovR); + } + + xspan.CovLInitial = (xspan.EdgeCovL >> 12) & 0x3FF; + if (xspan.CovLInitial == 0x3FF) + xspan.CovLInitial = 0; + xspan.CovRInitial = (xspan.EdgeCovR >> 12) & 0x3FF; + if (xspan.CovRInitial == 0x3FF) + xspan.CovRInitial = 0; + + xspan.X0 = xl; + xspan.X1 = xr + 1; + + uint polyalpha = ((polygon.Attr >> 16) & 0x1FU); + bool isWireframe = polyalpha == 0U; + + if (!isWireframe || (y == polygon.YTop || y == polygon.YBot - 1)) + xspan.Flags |= XSpanSetup_FillInside; + + xspan.InsideStart = xspan.X0 + edgeLenL; + if (xspan.InsideStart > xspan.X1) + xspan.InsideStart = xspan.X1; + xspan.InsideEnd = xspan.X1 - edgeLenR; + if (xspan.InsideEnd > xspan.X1) + xspan.InsideEnd = xspan.X1; + + bool isShadowMask = ((polygon.Attr & 0x3F000030U) == 0x00000030U); + bool fillAllEdges = polyalpha < 31 || (DispCnt & (3U<<4)) != 0U; + + if (fillAllEdges || spanL.X1 < spanL.X0 || spanL.Increment <= 0x40000) + xspan.Flags |= XSpanSetup_FillLeft; + if (fillAllEdges || (spanR.X1 >= spanR.X0 && spanR.Increment > 0x40000) || spanR.Increment == 0) + xspan.Flags |= XSpanSetup_FillRight; + + if (spanL.I0 == spanL.I1) + { + xspan.TexcoordU0 = spanL.TexcoordU0; + xspan.TexcoordV0 = spanL.TexcoordV0; + xspan.ColorR0 = spanL.ColorR0; + xspan.ColorG0 = spanL.ColorG0; + xspan.ColorB0 = spanL.ColorB0; + xspan.Z0 = spanL.Z0; + xspan.W0 = spanL.W0; + } + else + { + int i = 
(spanL.Increment > 0x40000 ? xl : y) - spanL.I0; + int ifactor = CalcYFactorY(spanL, i); + int idiff = spanL.I1 - spanL.I0; + +#ifdef ZBuffer + xspan.Z0 = int(InterpolateZZBuffer(spanL.Z0, spanL.Z1, i, spanL.IRecip, idiff)); +#endif +#ifdef WBuffer + xspan.Z0 = int(InterpolateZWBuffer(spanL.Z0, spanL.Z1, ifactor)); +#endif + + if (!spanL.Linear) + { + xspan.TexcoordU0 = InterpolateAttrPersp(spanL.TexcoordU0, spanL.TexcoordU1, ifactor); + xspan.TexcoordV0 = InterpolateAttrPersp(spanL.TexcoordV0, spanL.TexcoordV1, ifactor); + + xspan.ColorR0 = InterpolateAttrPersp(spanL.ColorR0, spanL.ColorR1, ifactor); + xspan.ColorG0 = InterpolateAttrPersp(spanL.ColorG0, spanL.ColorG1, ifactor); + xspan.ColorB0 = InterpolateAttrPersp(spanL.ColorB0, spanL.ColorB1, ifactor); + + xspan.W0 = InterpolateAttrPersp(spanL.W0, spanL.W1, ifactor); + } + else + { + xspan.TexcoordU0 = InterpolateAttrLinear(spanL.TexcoordU0, spanL.TexcoordU1, i, spanL.IRecip, idiff); + xspan.TexcoordV0 = InterpolateAttrLinear(spanL.TexcoordV0, spanL.TexcoordV1, i, spanL.IRecip, idiff); + + xspan.ColorR0 = InterpolateAttrLinear(spanL.ColorR0, spanL.ColorR1, i, spanL.IRecip, idiff); + xspan.ColorG0 = InterpolateAttrLinear(spanL.ColorG0, spanL.ColorG1, i, spanL.IRecip, idiff); + xspan.ColorB0 = InterpolateAttrLinear(spanL.ColorB0, spanL.ColorB1, i, spanL.IRecip, idiff); + + xspan.W0 = spanL.W0; // linear mode is only taken if W0 == W1 + } + } + + if (spanR.I0 == spanR.I1) + { + xspan.TexcoordU1 = spanR.TexcoordU0; + xspan.TexcoordV1 = spanR.TexcoordV0; + xspan.ColorR1 = spanR.ColorR0; + xspan.ColorG1 = spanR.ColorG0; + xspan.ColorB1 = spanR.ColorB0; + xspan.Z1 = spanR.Z0; + xspan.W1 = spanR.W0; + } + else + { + int i = (spanR.Increment > 0x40000 ? 
xr : y) - spanR.I0; + int ifactor = CalcYFactorY(spanR, i); + int idiff = spanR.I1 - spanR.I0; + + #ifdef ZBuffer + xspan.Z1 = int(InterpolateZZBuffer(spanR.Z0, spanR.Z1, i, spanR.IRecip, idiff)); + #endif + #ifdef WBuffer + xspan.Z1 = int(InterpolateZWBuffer(spanR.Z0, spanR.Z1, ifactor)); + #endif + + if (!spanR.Linear) + { + xspan.TexcoordU1 = InterpolateAttrPersp(spanR.TexcoordU0, spanR.TexcoordU1, ifactor); + xspan.TexcoordV1 = InterpolateAttrPersp(spanR.TexcoordV0, spanR.TexcoordV1, ifactor); + + xspan.ColorR1 = InterpolateAttrPersp(spanR.ColorR0, spanR.ColorR1, ifactor); + xspan.ColorG1 = InterpolateAttrPersp(spanR.ColorG0, spanR.ColorG1, ifactor); + xspan.ColorB1 = InterpolateAttrPersp(spanR.ColorB0, spanR.ColorB1, ifactor); + + xspan.W1 = int(InterpolateAttrPersp(spanR.W0, spanR.W1, ifactor)); + } + else + { + xspan.TexcoordU1 = InterpolateAttrLinear(spanR.TexcoordU0, spanR.TexcoordU1, i, spanR.IRecip, idiff); + xspan.TexcoordV1 = InterpolateAttrLinear(spanR.TexcoordV0, spanR.TexcoordV1, i, spanR.IRecip, idiff); + + xspan.ColorR1 = InterpolateAttrLinear(spanR.ColorR0, spanR.ColorR1, i, spanR.IRecip, idiff); + xspan.ColorG1 = InterpolateAttrLinear(spanR.ColorG0, spanR.ColorG1, i, spanR.IRecip, idiff); + xspan.ColorB1 = InterpolateAttrLinear(spanR.ColorB0, spanR.ColorB1, i, spanR.IRecip, idiff); + + xspan.W1 = spanR.W0; + } + } + + if (xspan.W0 == xspan.W1 && ((xspan.W0 | xspan.W1) & 0x7F) == 0) + { + xspan.Flags |= XSpanSetup_Linear; +// a bit hacky, but when wbuffering we only need to calculate xrecip for linear spans +#ifdef ZBuffer + } + { +#endif + uint r; + xspan.XRecip = int(Div(1U<<30, uint(xspan.X1 - xspan.X0), r)); + } + + XSpanSetups[gl_GlobalInvocationID.x] = xspan; +} + +)"; + +const std::string ClearIndirectWorkCount = + BinningBuffer + R"( + +layout (local_size_x = 32) in; + +void main() +{ + VariantWorkCount[gl_GlobalInvocationID.x] = uvec4(1, 1, 0, 0); +} + +)"; + +const std::string ClearCoarseBinMask = + BinningBuffer + R"( +layout 
(local_size_x = 32) in; + +void main() +{ + BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+0] = 0; + BinningMaskAndOffset[BinningCoarseMaskStart + gl_GlobalInvocationID.x*CoarseBinStride+1] = 0; +} + +)"; + +const std::string BinCombined = + PolygonBuffer + + BinningBuffer + + XSpanSetupBuffer + + WorkDescBuffer + R"( + +layout (local_size_x = 32) in; + +bool BinPolygon(Polygon polygon, ivec2 topLeft, ivec2 botRight) +{ + if (polygon.YTop > botRight.y || polygon.YBot <= topLeft.y) + return false; + + int polygonHeight = polygon.YBot - polygon.YTop; + + /* + All (good) polygons are convex. So the following holds true: + + Starting from the top most point where both edges originate + the X coordinate of the left edge will stay the same or fall until + the minimum X-axis coordinate is reached. Then it stays the same or + rises until the point it meets with the right edge. + + The same applies to the right edge, except that it first may rise or stay equal and + after the maximum point may only fall or stay the same. + + This means that for every tile which doesn't contain the point where the direction changes + we can just get the extreme point by sampling the top most and bottom most coordinate + within the tile. + + For a tile which is at the height of the direction change the polygon's XMin/XMax is used instead. + + As a sidenote another consequence of this design decision is + that malformed polygons aren't binned properly. + + As a note bottom Y is exclusive! 
+ */ + int polyInnerTopY = clamp(topLeft.y - polygon.YTop, 0, max(polygonHeight-1, 0)); + int polyInnerBotY = clamp(botRight.y - polygon.YTop, 0, max(polygonHeight-1, 0)); + + XSpanSetup xspanTop = XSpanSetups[polygon.FirstXSpan + polyInnerTopY]; + XSpanSetup xspanBot = XSpanSetups[polygon.FirstXSpan + polyInnerBotY]; + + int minXL; + if (polygon.XMinY >= topLeft.y && polygon.XMinY <= botRight.y) + minXL = polygon.XMin; + else + minXL = min(xspanTop.X0, xspanBot.X0); + + if (minXL > botRight.x) + return false; + + int maxXR; + if (polygon.XMaxY >= topLeft.y && polygon.XMaxY <= botRight.y) + maxXR = polygon.XMax; + else + maxXR = max(xspanTop.X1, xspanBot.X1) - 1; + + if (maxXR < topLeft.x) + return false; + + return true; +} + +shared uint mergedMaskShared; + +void main() +{ + int groupIdx = int(gl_WorkGroupID.x); + ivec2 coarseTile = ivec2(gl_WorkGroupID.yz); + +#if 0 + int localIdx = int(gl_SubGroupInvocationARB); +#else + int localIdx = int(gl_LocalInvocationIndex); + + if (localIdx == 0) + mergedMaskShared = 0U; + barrier(); +#endif + + int polygonIdx = groupIdx * 32 + localIdx; + + ivec2 coarseTopLeft = coarseTile * ivec2(CoarseTileW, CoarseTileH); + ivec2 coarseBotRight = coarseTopLeft + ivec2(CoarseTileW-1, CoarseTileH-1); + + bool binned = false; + if (polygonIdx < NumPolygons) + { + binned = BinPolygon(Polygons[polygonIdx], coarseTopLeft, coarseBotRight); + } + +#if 0 + uint mergedMask = unpackUint2x32(ballotARB(binned)).x; +#else + if (binned) + atomicOr(mergedMaskShared, 1U << localIdx); + barrier(); + uint mergedMask = mergedMaskShared; +#endif + + ivec2 fineTile = ivec2(localIdx & 0x7, localIdx >> 3); + + ivec2 fineTileTopLeft = coarseTopLeft + fineTile * ivec2(TileSize, TileSize); + ivec2 fineTileBotRight = fineTileTopLeft + ivec2(TileSize-1, TileSize-1); + + uint binnedMask = 0U; + while (mergedMask != 0U) + { + int bit = findLSB(mergedMask); + mergedMask &= ~(1U << bit); + + int polygonIdx = groupIdx * 32 + bit; + + if 
(BinPolygon(Polygons[polygonIdx], fineTileTopLeft, fineTileBotRight)) + binnedMask |= 1U << bit; + } + + int linearTile = fineTile.x + fineTile.y * TilesPerLine + coarseTile.x * CoarseTileCountX + coarseTile.y * TilesPerLine * CoarseTileCountY; + + BinningMaskAndOffset[BinningMaskStart + linearTile * BinStride + groupIdx] = binnedMask; + int coarseMaskIdx = linearTile * CoarseBinStride + (groupIdx >> 5); + if (binnedMask != 0U) + atomicOr(BinningMaskAndOffset[BinningCoarseMaskStart + coarseMaskIdx], 1U << (groupIdx & 0x1F)); + + if (binnedMask != 0U) + { + uint workOffset = atomicAdd(VariantWorkCount[0].w, uint(bitCount(binnedMask))); + BinningMaskAndOffset[BinningWorkOffsetsStart + linearTile * BinStride + groupIdx] = workOffset; + + uint tilePositionCombined = bitfieldInsert(fineTileTopLeft.x, fineTileTopLeft.y, 16, 16); + + int idx = 0; + while (binnedMask != 0U) + { + int bit = findLSB(binnedMask); + binnedMask &= ~(1U << bit); + + int polygonIdx = groupIdx * 32 + bit; + int variantIdx = Polygons[polygonIdx].Variant; + + int inVariantOffset = int(atomicAdd(VariantWorkCount[variantIdx].z, 1)); + WorkDescs[WorkDescsUnsortedStart + workOffset + idx] = uvec2(tilePositionCombined, bitfieldInsert(polygonIdx, inVariantOffset, 12, 20)); + + idx++; + } + } +} + +)"; + +const std::string CalcOffsets = + BinningBuffer + R"( + +layout (local_size_x = 32) in; + +void main() +{ + if (gl_GlobalInvocationID.x < NumVariants) + { + if (gl_GlobalInvocationID.x == 0) + { + // a bit of a cheat putting this here, but this shader won't run that often + SortWorkWorkCount = uvec4((VariantWorkCount[0].w + 31) / 32, 1, 1, 0); + } + SortedWorkOffset[gl_GlobalInvocationID.x] = atomicAdd(VariantWorkCount[1].w, VariantWorkCount[gl_GlobalInvocationID.x].z); + } +} + + +)"; + +const std::string SortWork = + PolygonBuffer + + BinningBuffer + + WorkDescBuffer + R"( + +layout (local_size_x = 32) in; + +void main() +{ + if (gl_GlobalInvocationID.x < VariantWorkCount[0].w) + { + uvec2 workDesc = 
WorkDescs[WorkDescsUnsortedStart + gl_GlobalInvocationID.x]; + int inVariantOffset = int(bitfieldExtract(workDesc.y, 12, 20)); + int polygonIdx = int(bitfieldExtract(workDesc.y, 0, 12)); + int variantIdx = Polygons[polygonIdx].Variant; + + int sortedIndex = int(SortedWorkOffset[variantIdx]) + inVariantOffset; + WorkDescs[WorkDescsSortedStart + sortedIndex] = uvec2(workDesc.x, bitfieldInsert(workDesc.y, gl_GlobalInvocationID.x, 12, 20)); + } +} + +)"; + +const std::string Rasterise = + PolygonBuffer + + WorkDescBuffer + + XSpanSetupBuffer + + BinningBuffer + + Tilebuffers + R"( + +layout (local_size_x = TileSize, local_size_y = TileSize) in; + +layout (binding = 0) uniform usampler2DArray CurrentTexture; + +layout (location = 0) uniform uint CurVariant; +layout (location = 1) uniform vec2 InvTextureSize; + +void main() +{ + uvec2 workDesc = WorkDescs[WorkDescsSortedStart + SortedWorkOffset[CurVariant] + gl_WorkGroupID.z]; + Polygon polygon = Polygons[bitfieldExtract(workDesc.y, 0, 12)]; + ivec2 position = ivec2(bitfieldExtract(workDesc.x, 0, 16), bitfieldExtract(workDesc.x, 16, 16)) + ivec2(gl_LocalInvocationID.xy); + int tileOffset = int(bitfieldExtract(workDesc.y, 12, 20)) * TileSize * TileSize + TileSize * int(gl_LocalInvocationID.y) + int(gl_LocalInvocationID.x); + + uint color = 0U; + if (position.y >= polygon.YTop && position.y < polygon.YBot) + { + XSpanSetup xspan = XSpanSetups[polygon.FirstXSpan + (position.y - polygon.YTop)]; + + bool insideLeftEdge = position.x < xspan.InsideStart; + bool insideRightEdge = position.x >= xspan.InsideEnd; + bool insidePolygonInside = !insideLeftEdge && !insideRightEdge; + + if (position.x >= xspan.X0 && position.x < xspan.X1 + && ((insideLeftEdge && (xspan.Flags & XSpanSetup_FillLeft) != 0U) + || (insideRightEdge && (xspan.Flags & XSpanSetup_FillRight) != 0U) + || (insidePolygonInside && (xspan.Flags & XSpanSetup_FillInside) != 0U))) + { + uint attr = 0; + if (position.y == polygon.YTop) + attr |= 0x4U; + else if 
(position.y == polygon.YBot - 1) + attr |= 0x8U; + + if (insideLeftEdge) + { + attr |= 0x1U; + + int cov = xspan.EdgeCovL; + if (cov < 0) + { + int xcov = xspan.CovLInitial + (xspan.EdgeCovL & 0x3FF) * (position.x - xspan.X0); + cov = min(xcov >> 5, 31); + } + + attr |= uint(cov) << 8; + } + else if (insideRightEdge) + { + attr |= 0x2U; + + int cov = xspan.EdgeCovR; + if (cov < 0) + { + int xcov = xspan.CovRInitial + (xspan.EdgeCovR & 0x3FF) * (position.x - xspan.InsideEnd); + cov = max(0x1F - (xcov >> 5), 0); + } + + attr |= uint(cov) << 8; + } + + uint z; + int u, v, vr, vg, vb; + + if (xspan.X0 == xspan.X1) + { + z = xspan.Z0; + u = xspan.TexcoordU0; + v = xspan.TexcoordV0; + vr = xspan.ColorR0; + vg = xspan.ColorG0; + vb = xspan.ColorB0; + } + else + { + int ifactor = CalcYFactorX(xspan, position.x); + int idiff = xspan.X1 - xspan.X0; + int i = position.x - xspan.X0; + +#ifdef ZBuffer + z = InterpolateZZBuffer(xspan.Z0, xspan.Z1, i, xspan.XRecip, idiff); +#endif +#ifdef WBuffer + z = InterpolateZWBuffer(xspan.Z0, xspan.Z1, ifactor); +#endif + if ((xspan.Flags & XSpanSetup_Linear) == 0U) + { + u = InterpolateAttrPersp(xspan.TexcoordU0, xspan.TexcoordU1, ifactor); + v = InterpolateAttrPersp(xspan.TexcoordV0, xspan.TexcoordV1, ifactor); + + vr = InterpolateAttrPersp(xspan.ColorR0, xspan.ColorR1, ifactor); + vg = InterpolateAttrPersp(xspan.ColorG0, xspan.ColorG1, ifactor); + vb = InterpolateAttrPersp(xspan.ColorB0, xspan.ColorB1, ifactor); + } + else + { + u = InterpolateAttrLinear(xspan.TexcoordU0, xspan.TexcoordU1, i, xspan.XRecip, idiff); + v = InterpolateAttrLinear(xspan.TexcoordV0, xspan.TexcoordV1, i, xspan.XRecip, idiff); + + vr = InterpolateAttrLinear(xspan.ColorR0, xspan.ColorR1, i, xspan.XRecip, idiff); + vg = InterpolateAttrLinear(xspan.ColorG0, xspan.ColorG1, i, xspan.XRecip, idiff); + vb = InterpolateAttrLinear(xspan.ColorB0, xspan.ColorB1, i, xspan.XRecip, idiff); + } + } + +#ifndef ShadowMask + vr >>= 3; + vg >>= 3; + vb >>= 3; + + uint r, g, b, a; + 
uint polyalpha = bitfieldExtract(polygon.Attr, 16, 5); + +#ifdef Toon + uint tooncolor = ToonTable[vr >> 1].r; + vr = int(bitfieldExtract(tooncolor, 0, 8)); + vg = int(bitfieldExtract(tooncolor, 8, 8)); + vb = int(bitfieldExtract(tooncolor, 16, 8)); +#endif +#ifdef Highlight + vg = vr; + vb = vr; +#endif + +#ifdef NoTexture + a = int(polyalpha); +#endif + r = vr; + g = vg; + b = vb; + +#ifdef UseTexture + vec2 uvf = vec2(ivec2(u, v)) * vec2(1.0 / 16.0) * InvTextureSize; + + uvec4 texcolor = texture(CurrentTexture, vec3(uvf, polygon.TextureLayer)); +#ifdef Decal + if (texcolor.a == 31) + { + r = int(texcolor.r); + g = int(texcolor.g); + b = int(texcolor.b); + } + else if (texcolor.a > 0) + { + r = int((texcolor.r * texcolor.a) + (vr * (31-texcolor.a))) >> 5; + g = int((texcolor.g * texcolor.a) + (vg * (31-texcolor.a))) >> 5; + b = int((texcolor.b * texcolor.a) + (vb * (31-texcolor.a))) >> 5; + } + a = int(polyalpha); +#endif +#if defined(Modulate) || defined(Toon) || defined(Highlight) + r = int((texcolor.r+1) * (vr+1) - 1) >> 6; + g = int((texcolor.g+1) * (vg+1) - 1) >> 6; + b = int((texcolor.b+1) * (vb+1) - 1) >> 6; + a = int((texcolor.a+1) * (polyalpha+1) - 1) >> 5; +#endif +#endif + +#ifdef Highlight + uint tooncolor = ToonTable[vr >> 1].r; + + r = min(r + int(bitfieldExtract(tooncolor, 0, 8)), 63); + g = min(g + int(bitfieldExtract(tooncolor, 8, 8)), 63); + b = min(b + int(bitfieldExtract(tooncolor, 16, 8)), 63); +#endif + + if (polyalpha == 0) + a = 31; + + if (a > AlphaRef) + { + color = r | (g << 8) | (b << 16) | (a << 24); + + DepthTiles[tileOffset] = z; + AttrTiles[tileOffset] = attr; + } +#else + color = 0xFFFFFFFF; // doesn't really matter as long as it's not 0 + DepthTiles[tileOffset] = z; +#endif + } + } + + ColorTiles[tileOffset] = color; +} + +)"; + +const std::string DepthBlend = + PolygonBuffer + + Tilebuffers + + ResultBuffer + + BinningBuffer + R"( + +layout (local_size_x = TileSize, local_size_y = TileSize) in; + +void PlotTranslucent(inout uint 
color, inout uint depth, inout uint attr, bool isShadow, uint tileColor, uint srcA, uint tileDepth, uint srcAttr, bool writeDepth) +{ + uint blendAttr = (srcAttr & 0xE0F0U) | ((srcAttr >> 8) & 0xFF0000U) | (1U<<22) | (attr & 0xFF001F0FU); + + if ((!isShadow || (attr & (1U<<22)) != 0U) + ? (attr & 0x007F0000U) != (blendAttr & 0x007F0000U) + : (attr & 0x3F000000U) != (srcAttr & 0x3F000000U)) + { + // le blend + if (writeDepth) + depth = tileDepth; + + if ((attr & (1U<<15)) == 0) + blendAttr &= ~(1U<<15); + attr = blendAttr; + + uint srcRB = tileColor & 0x3F003FU; + uint srcG = tileColor & 0x003F00U; + uint dstRB = color & 0x3F003FU; + uint dstG = color & 0x003F00U; + uint dstA = color & 0x1F000000U; + + uint alpha = (srcA >> 24) + 1; + if (dstA != 0) + { + srcRB = ((srcRB * alpha) + (dstRB * (32-alpha))) >> 5; + srcG = ((srcG * alpha) + (dstG * (32-alpha))) >> 5; + } + + color = (srcRB & 0x3F003FU) | (srcG & 0x003F00U) | max(dstA, srcA); + } +} + +void ProcessCoarseMask(int linearTile, uint coarseMask, uint coarseOffset, + inout uvec2 color, inout uvec2 depth, inout uvec2 attr, inout uint stencil, + inout bool prevIsShadowMask) +{ + int tileInnerOffset = int(gl_LocalInvocationID.x) + int(gl_LocalInvocationID.y) * TileSize; + + while (coarseMask != 0U) + { + uint coarseBit = findLSB(coarseMask); + coarseMask &= ~(1U << coarseBit); + + uint tileOffset = linearTile * BinStride + coarseBit + coarseOffset; + + uint fineMask = BinningMaskAndOffset[BinningMaskStart + tileOffset]; + uint workIdx = BinningMaskAndOffset[BinningWorkOffsetsStart + tileOffset]; + + while (fineMask != 0U) + { + uint fineIdx = findLSB(fineMask); + fineMask &= ~(1U << fineIdx); + + uint pixelindex = tileInnerOffset + workIdx * TileSize * TileSize; + uint tileColor = ColorTiles[pixelindex]; + workIdx++; + + uint polygonIdx = fineIdx + (coarseBit + coarseOffset) * 32; + + if (tileColor != 0U) + { + uint polygonAttr = Polygons[polygonIdx].Attr; + + bool isShadowMask = ((polygonAttr & 0x3F000030U) == 
0x00000030U); + bool prevIsShadowMaskOld = prevIsShadowMask; + prevIsShadowMask = isShadowMask; + + bool equalDepthTest = (polygonAttr & (1U << 14)) != 0U; + + uint tileDepth = DepthTiles[pixelindex]; + uint tileAttr = AttrTiles[pixelindex]; + + uint dstattr = attr.x; + + if (!isShadowMask) + { + bool isShadow = (polygonAttr & 0x30U) == 0x30U; + + bool writeSecondLayer = false; + + if (isShadow) + { + if (stencil == 0U) + continue; + if ((stencil & 1U) == 0U) + writeSecondLayer = true; + if ((stencil & 2U) == 0U) + dstattr &= ~0x3U; + } + + uint dstDepth = writeSecondLayer ? depth.y : depth.x; + if (!(equalDepthTest +#ifdef WBuffer + ? dstDepth - tileDepth + 0xFFU <= 0x1FE +#endif +#ifdef ZBuffer + ? dstDepth - tileDepth + 0x200 <= 0x400 +#endif + : tileDepth < dstDepth)) + { + if ((dstattr & 0x3U) == 0U || writeSecondLayer) + continue; + + writeSecondLayer = true; + dstattr = attr.y; + if (!(equalDepthTest +#ifdef WBuffer + ? depth.y - tileDepth + 0xFFU <= 0x1FE +#endif +#ifdef ZBuffer + ? 
depth.y - tileDepth + 0x200 <= 0x400 +#endif + : tileDepth < depth.y)) + continue; + } + + uint srcAttr = (polygonAttr & 0x3F008000U); + + uint srcA = tileColor & 0x1F000000U; + if (srcA == 0x1F000000U) + { + srcAttr |= tileAttr; + + if (!writeSecondLayer) + { + if ((srcAttr & 0x3U) != 0U) + { + color.y = color.x; + depth.y = depth.x; + attr.y = attr.x; + } + + color.x = tileColor; + depth.x = tileDepth; + attr.x = srcAttr; + } + else + { + color.y = tileColor; + depth.y = tileDepth; + attr.y = srcAttr; + } + } + else + { + bool writeDepth = (polygonAttr & (1U<<11)) != 0; + + if (!writeSecondLayer) + { + // blend into both layers + PlotTranslucent(color.x, depth.x, attr.x, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth); + } + if (writeSecondLayer || (dstattr & 0x3U) != 0U) + { + PlotTranslucent(color.y, depth.y, attr.y, isShadow, tileColor, srcA, tileDepth, srcAttr, writeDepth); + } + } + } + else + { + if (!prevIsShadowMaskOld) + stencil = 0; + + if (!(equalDepthTest +#ifdef WBuffer + ? depth.x - tileDepth + 0xFFU <= 0x1FE +#endif +#ifdef ZBuffer + ? depth.x - tileDepth + 0x200 <= 0x400 +#endif + : tileDepth < depth.x)) + stencil = 0x1U; + + if ((dstattr & 0x3U) != 0U) + { + if (!(equalDepthTest +#ifdef WBuffer + ? depth.y - tileDepth + 0xFFU <= 0x1FE +#endif +#ifdef ZBuffer + ? 
depth.y - tileDepth + 0x200 <= 0x400 +#endif + : tileDepth < depth.y)) + stencil |= 0x2U; + } + } + } + } + } +} + +void main() +{ + int linearTile = int(gl_WorkGroupID.x + (gl_WorkGroupID.y * TilesPerLine)); + + uint coarseMaskLo = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 0]; + uint coarseMaskHi = BinningMaskAndOffset[BinningCoarseMaskStart + linearTile*CoarseBinStride + 1]; + + uvec2 color = uvec2(ClearColor, 0U); + uvec2 depth = uvec2(ClearDepth, 0U); + uvec2 attr = uvec2(ClearAttr, 0U); + uint stencil = 0U; + bool prevIsShadowMask = false; + + ProcessCoarseMask(linearTile, coarseMaskLo, 0, color, depth, attr, stencil, prevIsShadowMask); + ProcessCoarseMask(linearTile, coarseMaskHi, BinStride/2, color, depth, attr, stencil, prevIsShadowMask); + + int resultOffset = int(gl_GlobalInvocationID.x) + int(gl_GlobalInvocationID.y) * ScreenWidth; + ResultValue[ResultColorStart+resultOffset] = color.x; + ResultValue[ResultColorStart+resultOffset+FramebufferStride] = color.y; + ResultValue[ResultDepthStart+resultOffset] = depth.x; + ResultValue[ResultDepthStart+resultOffset+FramebufferStride] = depth.y; + ResultValue[ResultAttrStart+resultOffset] = attr.x; + ResultValue[ResultAttrStart+resultOffset+FramebufferStride] = attr.y; +} + +)"; + +const std::string FinalPass = + ResultBuffer + R"( + +layout (local_size_x = 32) in; + +layout (binding = 0, rgba8) writeonly uniform image2D FinalFB; +layout (binding = 1, rgba8ui) writeonly uniform uimage2D LowResFB; + +uint BlendFog(uint color, uint depth) +{ + uint densityid = 0, densityfrac = 0; + + if (depth >= FogOffset) + { + depth -= FogOffset; + depth = (depth >> 2) << FogShift; + + densityid = depth >> 17; + if (densityid >= 32) + { + densityid = 32; + densityfrac = 0; + } + else + { + densityfrac = depth & 0x1FFFFU; + } + } + + uint density = + ((ToonTable[densityid].g * (0x20000U-densityfrac)) + + (ToonTable[densityid+1].g * densityfrac)) >> 17; + density = min(density, 128U); + + uint 
colorRB = color & 0x3F003FU; + uint colorGA = (color >> 8) & 0x3F003FU; + + uint fogRB = FogColor & 0x3F003FU; + uint fogGA = (FogColor >> 8) & 0x1F003FU; + + uint finalColorRB = ((fogRB * density) + (colorRB * (128-density))) >> 7; + uint finalColorGA = ((fogGA * density) + (colorGA * (128-density))) >> 7; + + finalColorRB &= 0x3F003FU; + finalColorGA &= 0x1F003FU; + + return (DispCnt & (1U<<6)) != 0 + ? (bitfieldInsert(color, finalColorGA >> 16, 24, 8)) + : (finalColorRB | (finalColorGA << 8)); +} + +void main() +{ + int srcX = int(gl_GlobalInvocationID.x); + int resultOffset = int(srcX) + int(gl_GlobalInvocationID.y) * ScreenWidth; + + uvec2 color = uvec2(ResultValue[resultOffset+ResultColorStart], ResultValue[resultOffset+FramebufferStride+ResultColorStart]); + uvec2 depth = uvec2(ResultValue[resultOffset+ResultDepthStart], ResultValue[resultOffset+FramebufferStride+ResultDepthStart]); + uvec2 attr = uvec2(ResultValue[resultOffset+ResultAttrStart], ResultValue[resultOffset+FramebufferStride+ResultAttrStart]); + +#ifdef EdgeMarking + if ((attr.x & 0xFU) != 0U) + { + uvec4 otherAttr = uvec4(ClearAttr); + uvec4 otherDepth = uvec4(ClearDepth); + + if (srcX > 0U) + { + otherAttr.x = ResultValue[resultOffset-1+ResultAttrStart]; + otherDepth.x = ResultValue[resultOffset-1+ResultDepthStart]; + } + if (srcX < ScreenWidth-1) + { + otherAttr.y = ResultValue[resultOffset+1+ResultAttrStart]; + otherDepth.y = ResultValue[resultOffset+1+ResultDepthStart]; + } + if (gl_GlobalInvocationID.y > 0U) + { + otherAttr.z = ResultValue[resultOffset-ScreenWidth+ResultAttrStart]; + otherDepth.z = ResultValue[resultOffset-ScreenWidth+ResultDepthStart]; + } + if (gl_GlobalInvocationID.y < ScreenHeight-1) + { + otherAttr.w = ResultValue[resultOffset+ScreenWidth+ResultAttrStart]; + otherDepth.w = ResultValue[resultOffset+ScreenWidth+ResultDepthStart]; + } + + uint polyId = bitfieldExtract(attr.x, 24, 6); + uvec4 otherPolyId = bitfieldExtract(otherAttr, 24, 6); + + bvec4 polyIdMismatch = 
notEqual(uvec4(polyId), otherPolyId); + bvec4 nearer = lessThan(uvec4(depth.x), otherDepth); + + if ((polyIdMismatch.x && nearer.x) + || (polyIdMismatch.y && nearer.y) + || (polyIdMismatch.z && nearer.z) + || (polyIdMismatch.w && nearer.w)) + { + color.x = ToonTable[polyId >> 3].b | (color.x & 0xFF000000U); + attr.x = (attr.x & 0xFFFFE0FFU) | 0x00001000U; + } + } +#endif + +#ifdef Fog + if ((attr.x & (1U<<15)) != 0U) + { + color.x = BlendFog(color.x, depth.x); + } + + if ((attr.x & 0xFU) != 0 && (attr.y & (1U<<15)) != 0U) + { + color.y = BlendFog(color.y, depth.y); + } +#endif + +#ifdef AntiAliasing + // resolve anti-aliasing + if ((attr.x & 0x3U) != 0) + { + uint coverage = (attr.x >> 8) & 0x1FU; + + if (coverage != 0) + { + uint topRB = color.x & 0x3F003FU; + uint topG = color.x & 0x003F00U; + uint topA = bitfieldExtract(color.x, 24, 5); + + uint botRB = color.y & 0x3F003FU; + uint botG = color.y & 0x003F00U; + uint botA = bitfieldExtract(color.y, 24, 5); + + coverage++; + + if (botA > 0) + { + topRB = ((topRB * coverage) + (botRB * (32-coverage))) >> 5; + topG = ((topG * coverage) + (botG * (32-coverage))) >> 5; + + topRB &= 0x3F003FU; + topG &= 0x003F00U; + } + + topA = ((topA * coverage) + (botA * (32-coverage))) >> 5; + + color.x = topRB | topG | (topA << 24); + } + else + { + color.x = color.y; + } + } +#endif + +// if (bitfieldExtract(color.x, 24, 8) != 0U) +// color.x |= 0x40000000U; +// else +// color.x = 0U; + + //if ((gl_GlobalInvocationID.y % 8) == 7 || (gl_GlobalInvocationID.y % 8) == 7) + // color.x = 0x1F00001FU | 0x40000000U; + + vec4 result = vec4(bitfieldExtract(color.x, 16, 8), bitfieldExtract(color.x, 8, 8), color.x & 0x3FU, bitfieldExtract(color.x, 24, 8)); + result /= vec4(63.0, 63.0, 63.0, 31.0); + imageStore(FinalFB, ivec2(gl_GlobalInvocationID.xy), result); + + // It's a division by constant, so using the builtin division is fine + const int scale = ScreenWidth/256; + ivec2 lowresCoordinate = ivec2(gl_GlobalInvocationID.xy) / scale; + 
ivec2 lowresCoordinateRest = ivec2(gl_GlobalInvocationID.xy) % scale; + if (lowresCoordinateRest == ivec2(0, 0)) + { + uvec4 color8; + color8.x = bitfieldExtract(color.x, 0, 8); + color8.y = bitfieldExtract(color.x, 8, 8); + color8.z = bitfieldExtract(color.x, 16, 8); + color8.w = bitfieldExtract(color.x, 24, 8); + imageStore(LowResFB, lowresCoordinate, color8); + } +} + +)"; + +} + +} + +#endif \ No newline at end of file diff --git a/src/GPU3D_OpenGL.cpp b/src/GPU3D_OpenGL.cpp index 3e9ce5b013..9088f0788a 100644 --- a/src/GPU3D_OpenGL.cpp +++ b/src/GPU3D_OpenGL.cpp @@ -28,46 +28,32 @@ namespace melonDS { -bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs) +bool GLRenderer::BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs) { char shadername[32]; snprintf(shadername, sizeof(shadername), "RenderShader%02X", flags); int headerlen = strlen(kShaderHeader); - int vslen = strlen(vs); - int vsclen = strlen(kRenderVSCommon); - char* vsbuf = new char[headerlen + vsclen + vslen + 1]; - strcpy(&vsbuf[0], kShaderHeader); - strcpy(&vsbuf[headerlen], kRenderVSCommon); - strcpy(&vsbuf[headerlen + vsclen], vs); + std::string vsbuf; + vsbuf += kShaderHeader; + vsbuf += kRenderVSCommon; + vsbuf += vs; - int fslen = strlen(fs); - int fsclen = strlen(kRenderFSCommon); - char* fsbuf = new char[headerlen + fsclen + fslen + 1]; - strcpy(&fsbuf[0], kShaderHeader); - strcpy(&fsbuf[headerlen], kRenderFSCommon); - strcpy(&fsbuf[headerlen + fsclen], fs); + std::string fsbuf; + fsbuf += kShaderHeader; + fsbuf += kRenderFSCommon; + fsbuf += fs; - bool ret = OpenGL::BuildShaderProgram(vsbuf, fsbuf, RenderShader[flags], shadername); - - delete[] vsbuf; - delete[] fsbuf; + GLuint prog; + bool ret = OpenGL::CompileVertexFragmentProgram(prog, + vsbuf, fsbuf, + shadername, + {{"vPosition", 0}, {"vColor", 1}, {"vTexcoord", 2}, {"vPolygonAttr", 3}}, + {{"oColor", 0}, {"oAttr", 1}}); if (!ret) return false; - GLuint prog = RenderShader[flags][2]; - - 
glBindAttribLocation(prog, 0, "vPosition"); - glBindAttribLocation(prog, 1, "vColor"); - glBindAttribLocation(prog, 2, "vTexcoord"); - glBindAttribLocation(prog, 3, "vPolygonAttr"); - glBindFragDataLocation(prog, 0, "oColor"); - glBindFragDataLocation(prog, 1, "oAttr"); - - if (!OpenGL::LinkShaderProgram(RenderShader[flags])) - return false; - GLint uni_id = glGetUniformBlockIndex(prog, "uConfig"); glUniformBlockBinding(prog, uni_id, 0); @@ -78,13 +64,15 @@ bool GLRenderer::BuildRenderShader(u32 flags, const char* vs, const char* fs) uni_id = glGetUniformLocation(prog, "TexPalMem"); glUniform1i(uni_id, 1); + RenderShader[flags] = prog; + return true; } void GLRenderer::UseRenderShader(u32 flags) { if (CurShaderID == flags) return; - glUseProgram(RenderShader[flags][2]); + glUseProgram(RenderShader[flags]); CurShaderID = flags; } @@ -125,21 +113,17 @@ std::unique_ptr GLRenderer::New() noexcept glDepthRange(0, 1); glClearDepth(1.0); - - if (!OpenGL::BuildShaderProgram(kClearVS, kClearFS, result->ClearShaderPlain, "ClearShader")) - return nullptr; - - glBindAttribLocation(result->ClearShaderPlain[2], 0, "vPosition"); - glBindFragDataLocation(result->ClearShaderPlain[2], 0, "oColor"); - glBindFragDataLocation(result->ClearShaderPlain[2], 1, "oAttr"); - - if (!OpenGL::LinkShaderProgram(result->ClearShaderPlain)) + if (!OpenGL::CompileVertexFragmentProgram(result->ClearShaderPlain, + kClearVS, kClearFS, + "ClearShader", + {{"vPosition", 0}}, + {{"oColor", 0}, {"oAttr", 1}})) return nullptr; - result->ClearUniformLoc[0] = glGetUniformLocation(result->ClearShaderPlain[2], "uColor"); - result->ClearUniformLoc[1] = glGetUniformLocation(result->ClearShaderPlain[2], "uDepth"); - result->ClearUniformLoc[2] = glGetUniformLocation(result->ClearShaderPlain[2], "uOpaquePolyID"); - result->ClearUniformLoc[3] = glGetUniformLocation(result->ClearShaderPlain[2], "uFogFlag"); + result->ClearUniformLoc[0] = glGetUniformLocation(result->ClearShaderPlain, "uColor"); + 
result->ClearUniformLoc[1] = glGetUniformLocation(result->ClearShaderPlain, "uDepth"); + result->ClearUniformLoc[2] = glGetUniformLocation(result->ClearShaderPlain, "uOpaquePolyID"); + result->ClearUniformLoc[3] = glGetUniformLocation(result->ClearShaderPlain, "uFogFlag"); memset(result->RenderShader, 0, sizeof(RenderShader)); @@ -167,42 +151,35 @@ std::unique_ptr GLRenderer::New() noexcept if (!result->BuildRenderShader(RenderFlag_ShadowMask | RenderFlag_WBuffer, kRenderVS_W, kRenderFS_WSM)) return nullptr; - if (!OpenGL::BuildShaderProgram(kFinalPassVS, kFinalPassEdgeFS, result->FinalPassEdgeShader, "FinalPassEdgeShader")) - return nullptr; - - if (!OpenGL::BuildShaderProgram(kFinalPassVS, kFinalPassFogFS, result->FinalPassFogShader, "FinalPassFogShader")) + if (!OpenGL::CompileVertexFragmentProgram(result->FinalPassEdgeShader, + kFinalPassVS, kFinalPassEdgeFS, + "FinalPassEdgeShader", + {{"vPosition", 0}}, + {{"oColor", 0}})) return nullptr; - - glBindAttribLocation(result->FinalPassEdgeShader[2], 0, "vPosition"); - glBindFragDataLocation(result->FinalPassEdgeShader[2], 0, "oColor"); - - if (!OpenGL::LinkShaderProgram(result->FinalPassEdgeShader)) + if (!OpenGL::CompileVertexFragmentProgram(result->FinalPassFogShader, + kFinalPassVS, kFinalPassFogFS, + "FinalPassFogShader", + {{"vPosition", 0}}, + {{"oColor", 0}})) return nullptr; - GLint uni_id = glGetUniformBlockIndex(result->FinalPassEdgeShader[2], "uConfig"); - glUniformBlockBinding(result->FinalPassEdgeShader[2], uni_id, 0); - - glUseProgram(result->FinalPassEdgeShader[2]); + GLuint uni_id = glGetUniformBlockIndex(result->FinalPassEdgeShader, "uConfig"); + glUniformBlockBinding(result->FinalPassEdgeShader, uni_id, 0); - uni_id = glGetUniformLocation(result->FinalPassEdgeShader[2], "DepthBuffer"); + glUseProgram(result->FinalPassEdgeShader); + uni_id = glGetUniformLocation(result->FinalPassEdgeShader, "DepthBuffer"); glUniform1i(uni_id, 0); - uni_id = glGetUniformLocation(result->FinalPassEdgeShader[2], 
"AttrBuffer"); + uni_id = glGetUniformLocation(result->FinalPassEdgeShader, "AttrBuffer"); glUniform1i(uni_id, 1); - glBindAttribLocation(result->FinalPassFogShader[2], 0, "vPosition"); - glBindFragDataLocation(result->FinalPassFogShader[2], 0, "oColor"); - - if (!OpenGL::LinkShaderProgram(result->FinalPassFogShader)) - return nullptr; - - uni_id = glGetUniformBlockIndex(result->FinalPassFogShader[2], "uConfig"); - glUniformBlockBinding(result->FinalPassFogShader[2], uni_id, 0); - - glUseProgram(result->FinalPassFogShader[2]); + uni_id = glGetUniformBlockIndex(result->FinalPassFogShader, "uConfig"); + glUniformBlockBinding(result->FinalPassFogShader, uni_id, 0); - uni_id = glGetUniformLocation(result->FinalPassFogShader[2], "DepthBuffer"); + glUseProgram(result->FinalPassFogShader); + uni_id = glGetUniformLocation(result->FinalPassFogShader, "DepthBuffer"); glUniform1i(uni_id, 0); - uni_id = glGetUniformLocation(result->FinalPassFogShader[2], "AttrBuffer"); + uni_id = glGetUniformLocation(result->FinalPassFogShader, "AttrBuffer"); glUniform1i(uni_id, 1); @@ -255,29 +232,28 @@ std::unique_ptr GLRenderer::New() noexcept glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, result->IndexBufferID); glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(IndexBuffer), nullptr, GL_DYNAMIC_DRAW); - glGenFramebuffers(4, &result->FramebufferID[0]); - glBindFramebuffer(GL_FRAMEBUFFER, result->FramebufferID[0]); - - glGenTextures(8, &result->FramebufferTex[0]); - result->FrontBuffer = 0; + glGenFramebuffers(1, &result->MainFramebuffer); + // SetRenderSettings() and PrepareCaptureFrame() bind DownscaleFramebuffer, so it has to be generated here as well + glGenFramebuffers(1, &result->DownscaleFramebuffer); // color buffers - SetupDefaultTexParams(result->FramebufferTex[0]); - SetupDefaultTexParams(result->FramebufferTex[1]); + glGenTextures(1, &result->ColorBufferTex); + SetupDefaultTexParams(result->ColorBufferTex); // depth/stencil buffer - SetupDefaultTexParams(result->FramebufferTex[4]); - 
SetupDefaultTexParams(result->FramebufferTex[6]); + glGenTextures(1, &result->DepthBufferTex); + SetupDefaultTexParams(result->DepthBufferTex); // attribute buffer // R: opaque 
polyID (for edgemarking) // G: edge flag // B: fog flag - SetupDefaultTexParams(result->FramebufferTex[5]); - SetupDefaultTexParams(result->FramebufferTex[7]); + glGenTextures(1, &result->AttrBufferTex); + SetupDefaultTexParams(result->AttrBufferTex); // downscale framebuffer for display capture (always 256x192) - SetupDefaultTexParams(result->FramebufferTex[3]); + glGenTextures(1, &result->DownScaleBufferTex); + SetupDefaultTexParams(result->DownScaleBufferTex); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 256, 192, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); glEnable(GL_BLEND); @@ -315,8 +289,12 @@ GLRenderer::~GLRenderer() glDeleteTextures(1, &TexMemID); glDeleteTextures(1, &TexPalMemID); - glDeleteFramebuffers(4, &FramebufferID[0]); - glDeleteTextures(8, &FramebufferTex[0]); + glDeleteFramebuffers(1, &MainFramebuffer); + glDeleteFramebuffers(1, &DownscaleFramebuffer); + glDeleteTextures(1, &ColorBufferTex); + glDeleteTextures(1, &DepthBufferTex); + glDeleteTextures(1, &AttrBufferTex); + glDeleteTextures(1, &DownScaleBufferTex); glDeleteVertexArrays(1, &VertexArrayID); glDeleteBuffers(1, &VertexBufferID); @@ -327,8 +305,8 @@ GLRenderer::~GLRenderer() for (int i = 0; i < 16; i++) { - if (!RenderShader[i][2]) continue; - OpenGL::DeleteShaderProgram(RenderShader[i]); + if (!RenderShader[i]) continue; + glDeleteProgram(RenderShader[i]); } } @@ -361,40 +339,25 @@ void GLRenderer::SetRenderSettings(bool betterpolygons, int scale) noexcept ScreenW = 256 * scale; ScreenH = 192 * scale; - glBindTexture(GL_TEXTURE_2D, FramebufferTex[0]); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glBindTexture(GL_TEXTURE_2D, FramebufferTex[1]); + glBindTexture(GL_TEXTURE_2D, ColorBufferTex); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, ScreenW, ScreenH, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); - glBindTexture(GL_TEXTURE_2D, FramebufferTex[4]); + glBindTexture(GL_TEXTURE_2D, DepthBufferTex); glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, 
ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL); - glBindTexture(GL_TEXTURE_2D, FramebufferTex[5]); + glBindTexture(GL_TEXTURE_2D, AttrBufferTex); glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL); - glBindTexture(GL_TEXTURE_2D, FramebufferTex[6]); - glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH24_STENCIL8, ScreenW, ScreenH, 0, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, NULL); - glBindTexture(GL_TEXTURE_2D, FramebufferTex[7]); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, ScreenW, ScreenH, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL); - - glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[3]); - glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[3], 0); + glBindFramebuffer(GL_FRAMEBUFFER, DownscaleFramebuffer); + glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, DownScaleBufferTex, 0); GLenum fbassign[2] = {GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1}; - glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]); - glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[0], 0); - glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, FramebufferTex[4], 0); - glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, FramebufferTex[5], 0); - glDrawBuffers(2, fbassign); - - glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[1]); - glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, FramebufferTex[1], 0); - glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, FramebufferTex[6], 0); - glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, FramebufferTex[7], 0); + glBindFramebuffer(GL_FRAMEBUFFER, MainFramebuffer); + glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, ColorBufferTex, 0); + glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, DepthBufferTex, 0); + glFramebufferTexture(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT1, AttrBufferTex, 0); glDrawBuffers(2, fbassign); - glBindFramebuffer(GL_FRAMEBUFFER, FramebufferID[0]); - 
glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID); glBufferData(GL_PIXEL_PACK_BUFFER, 256*192*4, NULL, GL_DYNAMIC_READ); @@ -1103,9 +1066,9 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h) glStencilMask(0); glActiveTexture(GL_TEXTURE0); - glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer ? 6 : 4]); + glBindTexture(GL_TEXTURE_2D, DepthBufferTex); glActiveTexture(GL_TEXTURE1); - glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer ? 7 : 5]); + glBindTexture(GL_TEXTURE_2D, AttrBufferTex); glBindBuffer(GL_ARRAY_BUFFER, ClearVertexBufferID); glBindVertexArray(ClearVertexArrayID); @@ -1115,7 +1078,7 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h) // edge marking // TODO: depth/polyid values at screen edges - glUseProgram(FinalPassEdgeShader[2]); + glUseProgram(FinalPassEdgeShader); glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ZERO, GL_ONE); @@ -1126,7 +1089,7 @@ void GLRenderer::RenderSceneChunk(const GPU3D& gpu3d, int y, int h) { // fog - glUseProgram(FinalPassFogShader[2]); + glUseProgram(FinalPassFogShader); if (gpu3d.RenderDispCnt & (1<<6)) glBlendFuncSeparate(GL_ZERO, GL_ONE, GL_CONSTANT_COLOR, GL_ONE_MINUS_SRC_ALPHA); @@ -1154,7 +1117,7 @@ void GLRenderer::RenderFrame(GPU& gpu) CurShaderID = -1; glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, FramebufferID[FrontBuffer]); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, MainFramebuffer); ShaderConfig.uScreenSize[0] = ScreenW; ShaderConfig.uScreenSize[1] = ScreenH; @@ -1260,7 +1223,7 @@ void GLRenderer::RenderFrame(GPU& gpu) // TODO: check whether 'clear polygon ID' affects translucent polyID // (for example when alpha is 1..30) { - glUseProgram(ClearShaderPlain[2]); + glUseProgram(ClearShaderPlain); glDepthFunc(GL_ALWAYS); u32 r = gpu.GPU3D.RenderClearAttr1 & 0x1F; @@ -1320,8 +1283,6 @@ void GLRenderer::RenderFrame(GPU& gpu) RenderSceneChunk(gpu.GPU3D, 0, 192); } - - FrontBuffer = FrontBuffer ? 
0 : 1; } void GLRenderer::Stop(const GPU& gpu) @@ -1331,16 +1292,14 @@ void GLRenderer::Stop(const GPU& gpu) void GLRenderer::PrepareCaptureFrame() { - // TODO: make sure this picks the right buffer when doing antialiasing - int original_fb = FrontBuffer^1; - - glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferID[original_fb]); + glBindFramebuffer(GL_READ_FRAMEBUFFER, MainFramebuffer); glReadBuffer(GL_COLOR_ATTACHMENT0); - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, FramebufferID[3]); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, DownscaleFramebuffer); glDrawBuffer(GL_COLOR_ATTACHMENT0); glBlitFramebuffer(0, 0, ScreenW, ScreenH, 0, 0, 256, 192, GL_COLOR_BUFFER_BIT, GL_NEAREST); - glBindFramebuffer(GL_READ_FRAMEBUFFER, FramebufferID[3]); + glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID); + glBindFramebuffer(GL_READ_FRAMEBUFFER, DownscaleFramebuffer); glReadPixels(0, 0, 256, 192, GL_BGRA, GL_UNSIGNED_BYTE, NULL); } @@ -1349,12 +1308,18 @@ void GLRenderer::Blit(const GPU& gpu) CurGLCompositor.RenderFrame(gpu, *this); } +void GLRenderer::BindOutputTexture(int buffer) +{ + CurGLCompositor.BindOutputTexture(buffer); +} + u32* GLRenderer::GetLine(int line) { int stride = 256; if (line == 0) { + glBindBuffer(GL_PIXEL_PACK_BUFFER, PixelbufferID); u8* data = (u8*)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); if (data) memcpy(&Framebuffer[stride*0], data, 4*stride*192); glUnmapBuffer(GL_PIXEL_PACK_BUFFER); @@ -1374,7 +1339,7 @@ u32* GLRenderer::GetLine(int line) void GLRenderer::SetupAccelFrame() { - glBindTexture(GL_TEXTURE_2D, FramebufferTex[FrontBuffer]); + glBindTexture(GL_TEXTURE_2D, ColorBufferTex); } } diff --git a/src/GPU3D_OpenGL.h b/src/GPU3D_OpenGL.h index c30232ca30..dcab6e8706 100644 --- a/src/GPU3D_OpenGL.h +++ b/src/GPU3D_OpenGL.h @@ -44,12 +44,11 @@ class GLRenderer : public Renderer3D void Stop(const GPU& gpu) override; u32* GetLine(int line) override; - void SetupAccelFrame(); + void SetupAccelFrame() override; void PrepareCaptureFrame() override; void Blit(const 
GPU& gpu) override; - [[nodiscard]] const GLCompositor& GetCompositor() const noexcept { return CurGLCompositor; } - GLCompositor& GetCompositor() noexcept { return CurGLCompositor; } + void BindOutputTexture(int buffer) override; static std::unique_ptr New() noexcept; private: @@ -77,7 +76,7 @@ class GLRenderer : public Renderer3D GLCompositor CurGLCompositor; RendererPolygon PolygonList[2048] {}; - bool BuildRenderShader(u32 flags, const char* vs, const char* fs); + bool BuildRenderShader(u32 flags, const std::string& vs, const std::string& fs); void UseRenderShader(u32 flags); void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const; u32* SetupVertex(const Polygon* poly, int vid, const Vertex* vtx, u32 vtxattr, u32* vptr) const; @@ -96,13 +95,13 @@ class GLRenderer : public Renderer3D }; - GLuint ClearShaderPlain[3] {}; + GLuint ClearShaderPlain {}; - GLuint RenderShader[16][3] {}; + GLuint RenderShader[16] {}; GLuint CurShaderID = -1; - GLuint FinalPassEdgeShader[3] {}; - GLuint FinalPassFogShader[3] {}; + GLuint FinalPassEdgeShader {}; + GLuint FinalPassFogShader {}; // std140 compliant structure struct @@ -155,12 +154,12 @@ class GLRenderer : public Renderer3D bool BetterPolygons {}; int ScreenW {}, ScreenH {}; - GLuint FramebufferTex[8] {}; - int FrontBuffer {}; - GLuint FramebufferID[4] {}, PixelbufferID {}; - u32 Framebuffer[256*192] {}; - + GLuint ColorBufferTex {}, DepthBufferTex {}, AttrBufferTex {}; + GLuint DownScaleBufferTex {}; + GLuint PixelbufferID {}; + GLuint MainFramebuffer {}, DownscaleFramebuffer {}; + u32 Framebuffer[256*192] {}; }; } #endif \ No newline at end of file diff --git a/src/GPU3D_Soft.cpp b/src/GPU3D_Soft.cpp index 74027d5b5e..a8da14cda6 100644 --- a/src/GPU3D_Soft.cpp +++ b/src/GPU3D_Soft.cpp @@ -95,8 +95,8 @@ void SoftRenderer::EnableRenderThread() } } -SoftRenderer::SoftRenderer(bool threaded) noexcept - : Renderer3D(false), Threaded(threaded) +SoftRenderer::SoftRenderer() noexcept + : Renderer3D(false) { 
Sema_RenderStart = Platform::Semaphore_Create(); Sema_RenderDone = Platform::Semaphore_Create(); diff --git a/src/GPU3D_Soft.h b/src/GPU3D_Soft.h index 9cfdf9ad5e..45b2c53999 100644 --- a/src/GPU3D_Soft.h +++ b/src/GPU3D_Soft.h @@ -29,7 +29,7 @@ namespace melonDS class SoftRenderer : public Renderer3D { public: - SoftRenderer(bool threaded = false) noexcept; + SoftRenderer() noexcept; ~SoftRenderer() override; void Reset(GPU& gpu) override; @@ -504,7 +504,7 @@ class SoftRenderer : public Renderer3D // threading - bool Threaded; + bool Threaded = false; Platform::Thread* RenderThread; std::atomic_bool RenderThreadRunning; std::atomic_bool RenderThreadRendering; diff --git a/src/GPU3D_Texcache.cpp b/src/GPU3D_Texcache.cpp new file mode 100644 index 0000000000..196009e6b6 --- /dev/null +++ b/src/GPU3D_Texcache.cpp @@ -0,0 +1,269 @@ +#include "GPU3D_Texcache.h" + +namespace melonDS +{ + +inline u16 ColorAvg(u16 color0, u16 color1) +{ + u32 r0 = color0 & 0x001F; + u32 g0 = color0 & 0x03E0; + u32 b0 = color0 & 0x7C00; + u32 r1 = color1 & 0x001F; + u32 g1 = color1 & 0x03E0; + u32 b1 = color1 & 0x7C00; + + u32 r = (r0 + r1) >> 1; + u32 g = ((g0 + g1) >> 1) & 0x03E0; + u32 b = ((b0 + b1) >> 1) & 0x7C00; + + return r | g | b; +} + +inline u16 Color5of3(u16 color0, u16 color1) +{ + u32 r0 = color0 & 0x001F; + u32 g0 = color0 & 0x03E0; + u32 b0 = color0 & 0x7C00; + u32 r1 = color1 & 0x001F; + u32 g1 = color1 & 0x03E0; + u32 b1 = color1 & 0x7C00; + + u32 r = (r0*5 + r1*3) >> 3; + u32 g = ((g0*5 + g1*3) >> 3) & 0x03E0; + u32 b = ((b0*5 + b1*3) >> 3) & 0x7C00; + + return r | g | b; +} + +inline u16 Color3of5(u16 color0, u16 color1) +{ + u32 r0 = color0 & 0x001F; + u32 g0 = color0 & 0x03E0; + u32 b0 = color0 & 0x7C00; + u32 r1 = color1 & 0x001F; + u32 g1 = color1 & 0x03E0; + u32 b1 = color1 & 0x7C00; + + u32 r = (r0*3 + r1*5) >> 3; + u32 g = ((g0*3 + g1*5) >> 3) & 0x03E0; + u32 b = ((b0*3 + b1*5) >> 3) & 0x7C00; + + return r | g | b; +} + +inline u32 ConvertRGB5ToRGB8(u16 val) +{ 
+ return (((u32)val & 0x1F) << 3) + | (((u32)val & 0x3E0) << 6) + | (((u32)val & 0x7C00) << 9); +} +inline u32 ConvertRGB5ToBGR8(u16 val) +{ + return (((u32)val & 0x1F) << 9) + | (((u32)val & 0x3E0) << 6) + | (((u32)val & 0x7C00) << 3); +} +inline u32 ConvertRGB5ToRGB6(u16 val) +{ + u8 r = (val & 0x1F) << 1; + u8 g = (val & 0x3E0) >> 4; + u8 b = (val & 0x7C00) >> 9; + if (r) r++; + if (g) g++; + if (b) b++; + return (u32)r | ((u32)g << 8) | ((u32)b << 16); +} + +template +void ConvertBitmapTexture(u32 width, u32 height, u32* output, u8* texData) +{ + for (u32 i = 0; i < width*height; i++) + { + u16 value = *(u16*)&texData[i * 2]; + + switch (outputFmt) + { + case outputFmt_RGB6A5: + output[i] = ConvertRGB5ToRGB6(value) | (value & 0x8000 ? 0x1F000000 : 0); + break; + case outputFmt_RGBA8: + output[i] = ConvertRGB5ToRGB8(value) | (value & 0x8000 ? 0xFF000000 : 0); + break; + case outputFmt_BGRA8: + output[i] = ConvertRGB5ToBGR8(value) | (value & 0x8000 ? 0xFF000000 : 0); + break; + } + } +} + +template void ConvertBitmapTexture(u32 width, u32 height, u32* output, u8* texData); + +template +void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData) +{ + // we process a whole block at the time + for (int y = 0; y < height / 4; y++) + { + for (int x = 0; x < width / 4; x++) + { + u32 data = ((u32*)texData)[x + y * (width / 4)]; + u16 auxData = ((u16*)texAuxData)[x + y * (width / 4)]; + + u32 paletteOffset = auxData & 0x3FFF; + u16 color0 = palData[paletteOffset*2] | 0x8000; + u16 color1 = palData[paletteOffset*2+1] | 0x8000; + u16 color2, color3; + + switch ((auxData >> 14) & 0x3) + { + case 0: + color2 = palData[paletteOffset*2+2] | 0x8000; + color3 = 0; + break; + case 1: + { + u32 r0 = color0 & 0x001F; + u32 g0 = color0 & 0x03E0; + u32 b0 = color0 & 0x7C00; + u32 r1 = color1 & 0x001F; + u32 g1 = color1 & 0x03E0; + u32 b1 = color1 & 0x7C00; + + u32 r = (r0 + r1) >> 1; + u32 g = ((g0 + g1) >> 1) & 0x03E0; + u32 b = ((b0 
+ b1) >> 1) & 0x7C00; + color2 = r | g | b | 0x8000; + } + color3 = 0; + break; + case 2: + color2 = palData[paletteOffset*2+2] | 0x8000; + color3 = palData[paletteOffset*2+3] | 0x8000; + break; + case 3: + { + u32 r0 = color0 & 0x001F; + u32 g0 = color0 & 0x03E0; + u32 b0 = color0 & 0x7C00; + u32 r1 = color1 & 0x001F; + u32 g1 = color1 & 0x03E0; + u32 b1 = color1 & 0x7C00; + + u32 r = (r0*5 + r1*3) >> 3; + u32 g = ((g0*5 + g1*3) >> 3) & 0x03E0; + u32 b = ((b0*5 + b1*3) >> 3) & 0x7C00; + + color2 = r | g | b | 0x8000; + } + { + u32 r0 = color0 & 0x001F; + u32 g0 = color0 & 0x03E0; + u32 b0 = color0 & 0x7C00; + u32 r1 = color1 & 0x001F; + u32 g1 = color1 & 0x03E0; + u32 b1 = color1 & 0x7C00; + + u32 r = (r0*3 + r1*5) >> 3; + u32 g = ((g0*3 + g1*5) >> 3) & 0x03E0; + u32 b = ((b0*3 + b1*5) >> 3) & 0x7C00; + + color3 = r | g | b | 0x8000; + } + break; + } + + // in 2020 our default data types are big enough to be used as lookup tables... + u64 packed = color0 | ((u64)color1 << 16) | ((u64)color2 << 32) | ((u64)color3 << 48); + + for (int j = 0; j < 4; j++) + { + for (int i = 0; i < 4; i++) + { + u16 color = (packed >> 16 * (data >> 2 * (i + j * 4))) & 0xFFFF; + u32 res; + switch (outputFmt) + { + case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) + | ((color & 0x8000) ? 0x1F000000 : 0); break; + case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) + | ((color & 0x8000) ? 0xFF000000 : 0); break; + case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) + | ((color & 0x8000) ? 
0xFF000000 : 0); break; + } + output[x * 4 + i + (y * 4 + j) * width] = res; + } + } + } + } +} + +template void ConvertCompressedTexture(u32, u32, u32*, u8*, u8*, u16*); + +template +void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData) +{ + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width; x++) + { + u8 val = texData[x + y * width]; + + u32 idx = val & ((1 << Y) - 1); + + u16 color = palData[idx]; + u32 alpha = (val >> Y) & ((1 << X) - 1); + if (X != 5) + alpha = alpha * 4 + alpha / 2; + + u32 res; + switch (outputFmt) + { + case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) | alpha << 24; break; + // make sure full alpha == 255 + case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) | (alpha << 27 | (alpha & 0x1C) << 22); break; + case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) | (alpha << 27 | (alpha & 0x1C) << 22); break; + } + output[x + y * width] = res; + } + } +} + +template void ConvertAXIYTexture(u32, u32, u32*, u8*, u16*); +template void ConvertAXIYTexture(u32, u32, u32*, u8*, u16*); + +template +void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent) +{ + for (int y = 0; y < height; y++) + { + for (int x = 0; x < width / (8 / colorBits); x++) + { + u8 val = texData[x + y * (width / (8 / colorBits))]; + + for (int i = 0; i < 8 / colorBits; i++) + { + u32 index = (val >> (i * colorBits)) & ((1 << colorBits) - 1); + u16 color = palData[index]; + + bool transparent = color0Transparent && index == 0; + u32 res; + switch (outputFmt) + { + case outputFmt_RGB6A5: res = ConvertRGB5ToRGB6(color) + | (transparent ? 0 : 0x1F000000); break; + case outputFmt_RGBA8: res = ConvertRGB5ToRGB8(color) + | (transparent ? 0 : 0xFF000000); break; + case outputFmt_BGRA8: res = ConvertRGB5ToBGR8(color) + | (transparent ? 
0 : 0xFF000000); break; + } + output[x * (8 / colorBits) + y * width + i] = res; + } + } + } +} + +template void ConvertNColorsTexture(u32, u32, u32*, u8*, u16*, bool); +template void ConvertNColorsTexture(u32, u32, u32*, u8*, u16*, bool); +template void ConvertNColorsTexture(u32, u32, u32*, u8*, u16*, bool); + +} \ No newline at end of file diff --git a/src/GPU3D_Texcache.h b/src/GPU3D_Texcache.h new file mode 100644 index 0000000000..214c6254fd --- /dev/null +++ b/src/GPU3D_Texcache.h @@ -0,0 +1,310 @@ +#ifndef GPU3D_TEXCACHE +#define GPU3D_TEXCACHE + +#include "types.h" +#include "GPU.h" + +#include +#include +#include + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" + +namespace melonDS +{ + +inline u32 TextureWidth(u32 texparam) +{ + return 8 << ((texparam >> 20) & 0x7); +} + +inline u32 TextureHeight(u32 texparam) +{ + return 8 << ((texparam >> 23) & 0x7); +} + +enum +{ + outputFmt_RGB6A5, + outputFmt_RGBA8, + outputFmt_BGRA8 +}; + +template +void ConvertBitmapTexture(u32 width, u32 height, u32* output, u8* texData); +template +void ConvertCompressedTexture(u32 width, u32 height, u32* output, u8* texData, u8* texAuxData, u16* palData); +template +void ConvertAXIYTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData); +template +void ConvertNColorsTexture(u32 width, u32 height, u32* output, u8* texData, u16* palData, bool color0Transparent); + +template +class Texcache +{ +public: + Texcache(const TexLoaderT& texloader) + : TexLoader(texloader) // probably better if this would be a move constructor??? 
+ {} + + bool Update(GPU& gpu) + { + auto textureDirty = gpu.VRAMDirty_Texture.DeriveState(gpu.VRAMMap_Texture, gpu); + auto texPalDirty = gpu.VRAMDirty_TexPal.DeriveState(gpu.VRAMMap_TexPal, gpu); + + bool textureChanged = gpu.MakeVRAMFlat_TextureCoherent(textureDirty); + bool texPalChanged = gpu.MakeVRAMFlat_TexPalCoherent(texPalDirty); + + if (textureChanged || texPalChanged) + { + //printf("check invalidation %d\n", TexCache.size()); + for (auto it = Cache.begin(); it != Cache.end();) + { + TexCacheEntry& entry = it->second; + if (textureChanged) + { + for (u32 i = 0; i < 2; i++) + { + u32 startBit = entry.TextureRAMStart[i] / VRAMDirtyGranularity; + u32 bitsCount = ((entry.TextureRAMStart[i] + entry.TextureRAMSize[i] + VRAMDirtyGranularity - 1) / VRAMDirtyGranularity) - startBit; + + u32 startEntry = startBit >> 6; + u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry; + for (u32 j = startEntry; j < startEntry + entriesCount; j++) + { + if (GetRangedBitMask(j, startBit, bitsCount) & textureDirty.Data[j]) + { + u64 newTexHash = XXH3_64bits(&gpu.VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]); + + if (newTexHash != entry.TextureHash[i]) + goto invalidate; + } + } + } + } + + if (texPalChanged && entry.TexPalSize > 0) + { + u32 startBit = entry.TexPalStart / VRAMDirtyGranularity; + u32 bitsCount = ((entry.TexPalStart + entry.TexPalSize + VRAMDirtyGranularity - 1) / VRAMDirtyGranularity) - startBit; + + u32 startEntry = startBit >> 6; + u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry; + for (u32 j = startEntry; j < startEntry + entriesCount; j++) + { + if (GetRangedBitMask(j, startBit, bitsCount) & texPalDirty.Data[j]) + { + u64 newPalHash = XXH3_64bits(&gpu.VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize); + if (newPalHash != entry.TexPalHash) + goto invalidate; + } + } + } + + it++; + continue; + invalidate: + FreeTextures[entry.WidthLog2][entry.HeightLog2].push_back(entry.Texture); + + 
//printf("invalidating texture %d\n", entry.ImageDescriptor); + + it = Cache.erase(it); + } + + return true; + } + + return false; + } + + void GetTexture(GPU& gpu, u32 texParam, u32 palBase, TexHandleT& textureHandle, u32& layer, u32*& helper) + { + // remove sampling and texcoord gen params + texParam &= ~0xC00F0000; + + u32 fmt = (texParam >> 26) & 0x7; + u64 key = texParam; + if (fmt != 7) + { + key |= (u64)palBase << 32; + if (fmt == 5) + key &= ~((u64)1 << 29); + } + //printf("%" PRIx64 " %" PRIx32 " %" PRIx32 "\n", key, texParam, palBase); + + assert(fmt != 0 && "no texture is not a texture format!"); + + auto it = Cache.find(key); + + if (it != Cache.end()) + { + textureHandle = it->second.Texture.TextureID; + layer = it->second.Texture.Layer; + helper = &it->second.LastVariant; + return; + } + + u32 widthLog2 = (texParam >> 20) & 0x7; + u32 heightLog2 = (texParam >> 23) & 0x7; + u32 width = 8 << widthLog2; + u32 height = 8 << heightLog2; + + u32 addr = (texParam & 0xFFFF) * 8; + + TexCacheEntry entry = {0}; + + entry.TextureRAMStart[0] = addr; + entry.WidthLog2 = widthLog2; + entry.HeightLog2 = heightLog2; + + // apparently a new texture + if (fmt == 7) + { + entry.TextureRAMSize[0] = width*height*2; + + ConvertBitmapTexture(width, height, DecodingBuffer, &gpu.VRAMFlat_Texture[addr]); + } + else if (fmt == 5) + { + u8* texData = &gpu.VRAMFlat_Texture[addr]; + u32 slot1addr = 0x20000 + ((addr & 0x1FFFC) >> 1); + if (addr >= 0x40000) + slot1addr += 0x10000; + u8* texAuxData = &gpu.VRAMFlat_Texture[slot1addr]; + + u16* palData = (u16*)(gpu.VRAMFlat_TexPal + palBase*16); + + entry.TextureRAMSize[0] = width*height/16*4; + entry.TextureRAMStart[1] = slot1addr; + entry.TextureRAMSize[1] = width*height/16*2; + entry.TexPalStart = palBase*16; + entry.TexPalSize = 0x10000; + + ConvertCompressedTexture(width, height, DecodingBuffer, texData, texAuxData, palData); + } + else + { + u32 texSize, palAddr = palBase*16, numPalEntries; + switch (fmt) + { + case 1: texSize = 
width*height; numPalEntries = 32; break; + case 6: texSize = width*height; numPalEntries = 8; break; + case 2: texSize = width*height/4; numPalEntries = 4; palAddr >>= 1; break; + case 3: texSize = width*height/2; numPalEntries = 16; break; + case 4: texSize = width*height; numPalEntries = 256; break; + } + + palAddr &= 0x1FFFF; + + /*printf("creating texture | fmt: %d | %dx%d | %08x | %08x\n", fmt, width, height, addr, palAddr); + svcSleepThread(1000*1000);*/ + + entry.TextureRAMSize[0] = texSize; + entry.TexPalStart = palAddr; + entry.TexPalSize = numPalEntries*2; + + u8* texData = &gpu.VRAMFlat_Texture[addr]; + u16* palData = (u16*)(gpu.VRAMFlat_TexPal + palAddr); + + //assert(entry.TexPalStart+entry.TexPalSize <= 128*1024*1024); + + bool color0Transparent = texParam & (1 << 29); + + switch (fmt) + { + case 1: ConvertAXIYTexture(width, height, DecodingBuffer, texData, palData); break; + case 6: ConvertAXIYTexture(width, height, DecodingBuffer, texData, palData); break; + case 2: ConvertNColorsTexture(width, height, DecodingBuffer, texData, palData, color0Transparent); break; + case 3: ConvertNColorsTexture(width, height, DecodingBuffer, texData, palData, color0Transparent); break; + case 4: ConvertNColorsTexture(width, height, DecodingBuffer, texData, palData, color0Transparent); break; + } + } + + for (int i = 0; i < 2; i++) + { + if (entry.TextureRAMSize[i]) + entry.TextureHash[i] = XXH3_64bits(&gpu.VRAMFlat_Texture[entry.TextureRAMStart[i]], entry.TextureRAMSize[i]); + } + if (entry.TexPalSize) + entry.TexPalHash = XXH3_64bits(&gpu.VRAMFlat_TexPal[entry.TexPalStart], entry.TexPalSize); + + auto& texArrays = TexArrays[widthLog2][heightLog2]; + auto& freeTextures = FreeTextures[widthLog2][heightLog2]; + + if (freeTextures.size() == 0) + { + texArrays.resize(texArrays.size()+1); + TexHandleT& array = texArrays[texArrays.size()-1]; + + u32 layers = std::min((8*1024*1024) / (width*height*4), 64); + + // allocate new array texture + //printf("allocating new layer 
set for %d %d %d %d\n", width, height, texArrays.size()-1, array.ImageDescriptor); + array = TexLoader.GenerateTexture(width, height, layers); + + for (u32 i = 0; i < layers; i++) + { + freeTextures.push_back(TexArrayEntry{array, i}); + } + } + + TexArrayEntry storagePlace = freeTextures[freeTextures.size()-1]; + freeTextures.pop_back(); + + entry.Texture = storagePlace; + + TexLoader.UploadTexture(storagePlace.TextureID, width, height, storagePlace.Layer, DecodingBuffer); + //printf("using storage place %d %d | %d %d (%d)\n", width, height, storagePlace.TexArrayIdx, storagePlace.LayerIdx, array.ImageDescriptor); + + textureHandle = storagePlace.TextureID; + layer = storagePlace.Layer; + helper = &Cache.emplace(std::make_pair(key, entry)).first->second.LastVariant; + } + + void Reset() + { + for (u32 i = 0; i < 8; i++) + { + for (u32 j = 0; j < 8; j++) + { + for (u32 k = 0; k < TexArrays[i][j].size(); k++) + TexLoader.DeleteTexture(TexArrays[i][j][k]); + TexArrays[i][j].clear(); + FreeTextures[i][j].clear(); + } + } + Cache.clear(); + } +private: + struct TexArrayEntry + { + TexHandleT TextureID; + u32 Layer; + }; + + struct TexCacheEntry + { + u32 LastVariant; // very cheap way to make variant lookup faster + + u32 TextureRAMStart[2], TextureRAMSize[2]; + u32 TexPalStart, TexPalSize; + u8 WidthLog2, HeightLog2; + TexArrayEntry Texture; + + u64 TextureHash[2]; + u64 TexPalHash; + }; + std::unordered_map Cache; + + TexLoaderT TexLoader; + + std::vector FreeTextures[8][8]; + std::vector TexArrays[8][8]; + + u32 DecodingBuffer[1024*1024]; +}; + +} + +#endif \ No newline at end of file diff --git a/src/GPU3D_TexcacheOpenGL.cpp b/src/GPU3D_TexcacheOpenGL.cpp new file mode 100644 index 0000000000..95ca8cdc8d --- /dev/null +++ b/src/GPU3D_TexcacheOpenGL.cpp @@ -0,0 +1,29 @@ +#include "GPU3D_TexcacheOpenGL.h" + +namespace melonDS +{ + +GLuint TexcacheOpenGLLoader::GenerateTexture(u32 width, u32 height, u32 layers) +{ + GLuint texarray; + glGenTextures(1, &texarray); + 
glBindTexture(GL_TEXTURE_2D_ARRAY, texarray); + glTexStorage3D(GL_TEXTURE_2D_ARRAY, 1, GL_RGBA8UI, width, height, layers); + return texarray; +} + +void TexcacheOpenGLLoader::UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data) +{ + glBindTexture(GL_TEXTURE_2D_ARRAY, handle); + glTexSubImage3D(GL_TEXTURE_2D_ARRAY, + 0, 0, 0, layer, + width, height, 1, + GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, data); +} + +void TexcacheOpenGLLoader::DeleteTexture(GLuint handle) +{ + glDeleteTextures(1, &handle); +} + +} \ No newline at end of file diff --git a/src/GPU3D_TexcacheOpenGL.h b/src/GPU3D_TexcacheOpenGL.h new file mode 100644 index 0000000000..a8cfa576d9 --- /dev/null +++ b/src/GPU3D_TexcacheOpenGL.h @@ -0,0 +1,25 @@ +#ifndef GPU3D_TEXCACHEOPENGL +#define GPU3D_TEXCACHEOPENGL + +#include "GPU3D_Texcache.h" +#include "OpenGLSupport.h" + +namespace melonDS +{ + +template +class Texcache; + +class TexcacheOpenGLLoader +{ +public: + GLuint GenerateTexture(u32 width, u32 height, u32 layers); + void UploadTexture(GLuint handle, u32 width, u32 height, u32 layer, void* data); + void DeleteTexture(GLuint handle); +}; + +using TexcacheOpenGL = Texcache; + +} + +#endif \ No newline at end of file diff --git a/src/GPU_OpenGL.cpp b/src/GPU_OpenGL.cpp index 2e2857cedf..6084405b26 100644 --- a/src/GPU_OpenGL.cpp +++ b/src/GPU_OpenGL.cpp @@ -36,32 +36,27 @@ using namespace OpenGL; std::optional GLCompositor::New() noexcept { assert(glBindAttribLocation != nullptr); + GLuint CompShader {}; - std::array CompShader {}; - if (!OpenGL::BuildShaderProgram(kCompositorVS, kCompositorFS_Nearest, &CompShader[0], "CompositorShader")) - return std::nullopt; - - glBindAttribLocation(CompShader[2], 0, "vPosition"); - glBindAttribLocation(CompShader[2], 1, "vTexcoord"); - glBindFragDataLocation(CompShader[2], 0, "oColor"); - - if (!OpenGL::LinkShaderProgram(CompShader.data())) - // OpenGL::LinkShaderProgram already deletes the shader program object - // if linking the shaders together 
failed. + if (!OpenGL::CompileVertexFragmentProgram(CompShader, + kCompositorVS, kCompositorFS_Nearest, + "CompositorShader", + {{"vPosition", 0}, {"vTexcoord", 1}}, + {{"oColor", 0}})) return std::nullopt; return { GLCompositor(CompShader) }; } -GLCompositor::GLCompositor(std::array compShader) noexcept : CompShader(compShader) +GLCompositor::GLCompositor(GLuint compShader) noexcept : CompShader(compShader) { - CompScaleLoc = glGetUniformLocation(CompShader[2], "u3DScale"); - Comp3DXPosLoc = glGetUniformLocation(CompShader[2], "u3DXPos"); + CompScaleLoc = glGetUniformLocation(CompShader, "u3DScale"); + Comp3DXPosLoc = glGetUniformLocation(CompShader, "u3DXPos"); - glUseProgram(CompShader[2]); - GLuint screenTextureUniform = glGetUniformLocation(CompShader[2], "ScreenTex"); + glUseProgram(CompShader); + GLuint screenTextureUniform = glGetUniformLocation(CompShader, "ScreenTex"); glUniform1i(screenTextureUniform, 0); - GLuint _3dTextureUniform = glGetUniformLocation(CompShader[2], "_3DTex"); + GLuint _3dTextureUniform = glGetUniformLocation(CompShader, "_3DTex"); glUniform1i(_3dTextureUniform, 1); // all this mess is to prevent bleeding @@ -136,7 +131,7 @@ GLCompositor::~GLCompositor() glDeleteVertexArrays(1, &CompVertexArrayID); glDeleteBuffers(1, &CompVertexBufferID); - OpenGL::DeleteShaderProgram(CompShader.data()); + glDeleteProgram(CompShader); } @@ -174,7 +169,7 @@ GLCompositor& GLCompositor::operator=(GLCompositor&& other) noexcept CompVertices = other.CompVertices; // Clean up these resources before overwriting them - OpenGL::DeleteShaderProgram(CompShader.data()); + glDeleteProgram(CompShader); CompShader = other.CompShader; glDeleteBuffers(1, &CompVertexBufferID); @@ -244,11 +239,11 @@ void GLCompositor::Stop(const GPU& gpu) noexcept glBindFramebuffer(GL_FRAMEBUFFER, 0); } -void GLCompositor::RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept +void GLCompositor::RenderFrame(const GPU& gpu, Renderer3D& renderer) noexcept { - int frontbuf = 
gpu.FrontBuffer; + int backbuf = gpu.FrontBuffer ^ 1; glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, CompScreenOutputFB[frontbuf]); + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, CompScreenOutputFB[backbuf]); glDisable(GL_DEPTH_TEST); glDisable(GL_STENCIL_TEST); @@ -260,7 +255,7 @@ void GLCompositor::RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept glClear(GL_COLOR_BUFFER_BIT); // TODO: select more shaders (filtering, etc) - OpenGL::UseShaderProgram(CompShader.data()); + glUseProgram(CompShader); glUniform1ui(CompScaleLoc, Scale); // TODO: support setting this midframe, if ever needed @@ -269,12 +264,12 @@ void GLCompositor::RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, CompScreenInputTex); - if (gpu.Framebuffer[frontbuf][0] && gpu.Framebuffer[frontbuf][1]) + if (gpu.Framebuffer[backbuf][0] && gpu.Framebuffer[backbuf][1]) { glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, gpu.Framebuffer[frontbuf][0].get()); + GL_UNSIGNED_BYTE, gpu.Framebuffer[backbuf][0].get()); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256*3 + 1, 192, GL_RGBA_INTEGER, - GL_UNSIGNED_BYTE, gpu.Framebuffer[frontbuf][1].get()); + GL_UNSIGNED_BYTE, gpu.Framebuffer[backbuf][1].get()); } glActiveTexture(GL_TEXTURE1); diff --git a/src/GPU_OpenGL.h b/src/GPU_OpenGL.h index 9c040966df..e9f4b17366 100644 --- a/src/GPU_OpenGL.h +++ b/src/GPU_OpenGL.h @@ -28,6 +28,7 @@ namespace melonDS class GPU; struct RenderSettings; class GLRenderer; +class Renderer3D; class GLCompositor { public: @@ -42,14 +43,14 @@ class GLCompositor [[nodiscard]] int GetScaleFactor() const noexcept { return Scale; } void Stop(const GPU& gpu) noexcept; - void RenderFrame(const GPU& gpu, GLRenderer& renderer) noexcept; + void RenderFrame(const GPU& gpu, Renderer3D& renderer) noexcept; void BindOutputTexture(int buf); private: - GLCompositor(std::array CompShader) noexcept; + 
GLCompositor(GLuint CompShader) noexcept; int Scale = 0; int ScreenH = 0, ScreenW = 0; - std::array CompShader {}; + GLuint CompShader {}; GLuint CompScaleLoc = 0; GLuint Comp3DXPosLoc = 0; diff --git a/src/NonStupidBitfield.h b/src/NonStupidBitfield.h index 4a5550f16e..a3cc4b2ec2 100644 --- a/src/NonStupidBitfield.h +++ b/src/NonStupidBitfield.h @@ -26,11 +26,38 @@ #include #include +namespace melonDS +{ + +inline u64 GetRangedBitMask(u32 idx, u32 startBit, u32 bitsCount) +{ + u32 startEntry = startBit >> 6; + u64 entriesCount = ((startBit + bitsCount + 0x3F) >> 6) - startEntry; + + if (entriesCount > 1) + { + if (idx == startEntry) + return 0xFFFFFFFFFFFFFFFF << (startBit & 0x3F); + if (((startBit + bitsCount) & 0x3F) && idx == startEntry + entriesCount - 1) + return ~(0xFFFFFFFFFFFFFFFF << ((startBit + bitsCount) & 0x3F)); + + return 0xFFFFFFFFFFFFFFFF; + } + else if (idx == startEntry) + { + return bitsCount == 64 + ? 0xFFFFFFFFFFFFFFFF + : ((1ULL << bitsCount) - 1) << (startBit & 0x3F); + } + else + { + return 0; + } +} + // like std::bitset but less stupid and optimised for // our use case (keeping track of memory invalidations) -namespace melonDS -{ template struct NonStupidBitField { @@ -166,6 +193,11 @@ struct NonStupidBitField return Ref{*this, idx}; } + bool operator[](u32 idx) const + { + return Data[idx >> 6] & (1ULL << (idx & 0x3F)); + } + void SetRange(u32 startBit, u32 bitsCount) { u32 startEntry = startBit >> 6; @@ -187,6 +219,26 @@ struct NonStupidBitField } } + int Min() const + { + for (int i = 0; i < DataLength; i++) + { + if (Data[i]) + return i * 64 + __builtin_ctzll(Data[i]); + } + return -1; + } + + int Max() const + { + for (int i = DataLength - 1; i >= 0; i--) + { + if (Data[i]) + return i * 64 + (63 - __builtin_clzll(Data[i])); + } + return -1; + } + NonStupidBitField& operator|=(const NonStupidBitField& other) { for (u32 i = 0; i < DataLength; i++) @@ -195,6 +247,7 @@ struct NonStupidBitField } return *this; } + NonStupidBitField& 
operator&=(const NonStupidBitField& other) { for (u32 i = 0; i < DataLength; i++) @@ -203,6 +256,20 @@ struct NonStupidBitField } return *this; } + + operator bool() const + { + for (int i = 0; i < DataLength - 1; i++) + { + if (Data[i]) + return true; + } + if (Data[DataLength-1] & ((Size&0x3F) ? ~(0xFFFFFFFFFFFFFFFF << (Size&0x3F)) : 0xFFFFFFFFFFFFFFFF)) + { + return true; + } + return false; + } }; } diff --git a/src/OpenGLSupport.cpp b/src/OpenGLSupport.cpp index 0eb05c531a..a7d000ce33 100644 --- a/src/OpenGLSupport.cpp +++ b/src/OpenGLSupport.cpp @@ -18,6 +18,14 @@ #include "OpenGLSupport.h" +#include +#include + +#include + +#define XXH_STATIC_LINKING_ONLY +#include "xxhash/xxhash.h" + namespace melonDS { @@ -27,72 +35,192 @@ using Platform::LogLevel; namespace OpenGL { -bool BuildShaderProgram(const char* vs, const char* fs, GLuint* ids, const char* name) +struct ShaderCacheEntry { - int len; - int res; + u32 Length; + u8* Data; + u32 BinaryFormat; - if (!glCreateShader) + ShaderCacheEntry(u8* data, u32 length, u32 binaryFmt) + : Length(length), Data(data), BinaryFormat(binaryFmt) { - Log(LogLevel::Error, "OpenGL: Cannot build shader program, OpenGL hasn't been loaded\n"); - return false; + assert(data != nullptr); } - ids[0] = glCreateShader(GL_VERTEX_SHADER); - len = strlen(vs); - glShaderSource(ids[0], 1, &vs, &len); - glCompileShader(ids[0]); + ShaderCacheEntry(const ShaderCacheEntry&) = delete; + ShaderCacheEntry(ShaderCacheEntry&& other) + { + Data = other.Data; + Length = other.Length; + BinaryFormat = other.BinaryFormat; - glGetShaderiv(ids[0], GL_COMPILE_STATUS, &res); - if (res != GL_TRUE) + other.Data = nullptr; + other.Length = 0; + other.BinaryFormat = 0; + } + + ~ShaderCacheEntry() { - glGetShaderiv(ids[0], GL_INFO_LOG_LENGTH, &res); - if (res < 1) res = 1024; - char* log = new char[res+1]; - glGetShaderInfoLog(ids[0], res+1, NULL, log); - Log(LogLevel::Error, "OpenGL: failed to compile vertex shader %s: %s\n", name, log); - 
Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", vs); - delete[] log; + if (Data) // check whether it was moved + delete[] Data; + } +}; + +std::unordered_map ShaderCache; +std::vector NewShaders; + +constexpr u32 ShaderCacheMagic = 0x11CAC4E1; +constexpr u32 ShaderCacheVersion = 1; + +void LoadShaderCache() +{ + // for now the shader cache only contains only compute shaders + // because they take the longest to compile + Platform::FileHandle* file = Platform::OpenLocalFile("shadercache", Platform::FileMode::Read); + if (file == nullptr) + { + Log(LogLevel::Error, "Could not find shader cache\n"); + return; + } + + u32 magic, version, numPrograms; + if (Platform::FileRead(&magic, 4, 1, file) != 1 || magic != ShaderCacheMagic) + { + Log(LogLevel::Error, "Shader cache file has invalid magic\n"); + goto fileInvalid; + } + + if (Platform::FileRead(&version, 4, 1, file) != 1 || version != ShaderCacheVersion) + { + Log(LogLevel::Error, "Shader cache file has bad version\n"); + goto fileInvalid; + } + + if (Platform::FileRead(&numPrograms, 4, 1, file) != 1) + { + Log(LogLevel::Error, "Shader cache file invalid program count\n"); + goto fileInvalid; + } + + // not the best approach, because once changes pile up + // we read and overwrite the old files + for (u32 i = 0; i < numPrograms; i++) + { + int error = 3; + + u32 length, binaryFormat; + u64 sourceHash; + error -= Platform::FileRead(&sourceHash, 8, 1, file); + error -= Platform::FileRead(&length, 4, 1, file); + error -= Platform::FileRead(&binaryFormat, 4, 1, file); + + if (error != 0) + { + Log(LogLevel::Error, "Invalid shader cache entry\n"); + goto fileInvalid; + } + + u8* data = new u8[length]; + if (Platform::FileRead(data, length, 1, file) != 1) + { + Log(LogLevel::Error, "Could not read shader cache entry data\n"); + delete[] data; + goto fileInvalid; + } + + ShaderCache.erase(sourceHash); + ShaderCache.emplace(sourceHash, ShaderCacheEntry(data, length, binaryFormat)); + } - glDeleteShader(ids[0]); 
+fileInvalid: + Platform::CloseFile(file); +} +void SaveShaderCache() +{ + Platform::FileHandle* file = Platform::OpenLocalFile("shadercache", Platform::FileMode::ReadWrite); + + if (file == nullptr) + { + Log(LogLevel::Error, "Could not open or create shader cache file\n"); + return; + } + + int written = 3; + u32 magic = ShaderCacheMagic, version = ShaderCacheVersion, numPrograms = ShaderCache.size(); + written -= Platform::FileWrite(&magic, 4, 1, file); + written -= Platform::FileWrite(&version, 4, 1, file); + written -= Platform::FileWrite(&numPrograms, 4, 1, file); + + if (written != 0) + { + Log(LogLevel::Error, "Could not write shader cache header\n"); + goto writeError; + } + + Platform::FileSeek(file, 0, Platform::FileSeekOrigin::End); + + printf("new shaders %d\n", NewShaders.size()); + + for (u64 newShader : NewShaders) + { + int error = 4; + auto it = ShaderCache.find(newShader); + + error -= Platform::FileWrite(&it->first, 8, 1, file); + error -= Platform::FileWrite(&it->second.Length, 4, 1, file); + error -= Platform::FileWrite(&it->second.BinaryFormat, 4, 1, file); + error -= Platform::FileWrite(it->second.Data, it->second.Length, 1, file); + + if (error != 0) + { + Log(LogLevel::Error, "Could not insert new shader cache entry\n"); + goto writeError; + } + } + +writeError: + Platform::CloseFile(file); + + NewShaders.clear(); +} + +bool CompilerShader(GLuint& id, const std::string& source, const std::string& name, const std::string& type) +{ + int res; + + if (!glCreateShader) + { + Log(LogLevel::Error, "OpenGL: Cannot build shader program, OpenGL hasn't been loaded\n"); return false; } - ids[1] = glCreateShader(GL_FRAGMENT_SHADER); - len = strlen(fs); - glShaderSource(ids[1], 1, &fs, &len); - glCompileShader(ids[1]); + const char* sourceC = source.c_str(); + int len = source.length(); + glShaderSource(id, 1, &sourceC, &len); + + glCompileShader(id); - glGetShaderiv(ids[1], GL_COMPILE_STATUS, &res); + glGetShaderiv(id, GL_COMPILE_STATUS, &res); if 
(res != GL_TRUE) { - glGetShaderiv(ids[1], GL_INFO_LOG_LENGTH, &res); + glGetShaderiv(id, GL_INFO_LOG_LENGTH, &res); if (res < 1) res = 1024; char* log = new char[res+1]; - glGetShaderInfoLog(ids[1], res+1, NULL, log); - Log(LogLevel::Error, "OpenGL: failed to compile fragment shader %s: %s\n", name, log); - //printf("shader source:\n--\n%s\n--\n", fs); + glGetShaderInfoLog(id, res+1, NULL, log); + Log(LogLevel::Error, "OpenGL: failed to compile %s shader %s: %s\n", type.c_str(), name.c_str(), log); + Log(LogLevel::Debug, "shader source:\n--\n%s\n--\n", source.c_str()); delete[] log; - Platform::FileHandle* logf = Platform::OpenFile("shaderfail.log", Platform::FileMode::WriteText); - Platform::FileWrite(fs, len+1, 1, logf); - Platform::CloseFile(logf); - - glDeleteShader(ids[0]); - glDeleteShader(ids[1]); + glDeleteShader(id); return false; } - ids[2] = glCreateProgram(); - glAttachShader(ids[2], ids[0]); - glAttachShader(ids[2], ids[1]); - return true; } -bool LinkShaderProgram(GLuint* ids) +bool LinkProgram(GLuint& result, GLuint* ids, int numIds) { int res; @@ -102,46 +230,132 @@ bool LinkShaderProgram(GLuint* ids) return false; } - glLinkProgram(ids[2]); + for (int i = 0; i < numIds; i++) + { + glAttachShader(result, ids[i]); + } - glDetachShader(ids[2], ids[0]); - glDetachShader(ids[2], ids[1]); + glLinkProgram(result); - glDeleteShader(ids[0]); - glDeleteShader(ids[1]); + for (int i = 0; i < numIds; i++) + glDetachShader(result, ids[i]); - glGetProgramiv(ids[2], GL_LINK_STATUS, &res); + glGetProgramiv(result, GL_LINK_STATUS, &res); if (res != GL_TRUE) { - glGetProgramiv(ids[2], GL_INFO_LOG_LENGTH, &res); + glGetProgramiv(result, GL_INFO_LOG_LENGTH, &res); if (res < 1) res = 1024; char* log = new char[res+1]; - glGetProgramInfoLog(ids[2], res+1, NULL, log); + glGetProgramInfoLog(result, res+1, NULL, log); Log(LogLevel::Error, "OpenGL: failed to link shader program: %s\n", log); delete[] log; - glDeleteProgram(ids[2]); - return false; } return true; } -void 
DeleteShaderProgram(GLuint* ids) +bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name) { - if (glDeleteProgram) - { // If OpenGL isn't loaded, then there's no shader program to delete - glDeleteProgram(ids[2]); + result = glCreateProgram(); + + /*u64 sourceHash = XXH64(source.data(), source.size(), 0); + auto it = ShaderCache.find(sourceHash); + if (it != ShaderCache.end()) + { + glProgramBinary(result, it->second.BinaryFormat, it->second.Data, it->second.Length); + + GLint linkStatus; + glGetProgramiv(result, GL_LINK_STATUS, &linkStatus); + if (linkStatus == GL_TRUE) + { + Log(LogLevel::Info, "Restored shader %s from cache\n", name.c_str()); + return true; + } + else + { + } + }*/ + Log(LogLevel::Error, "Shader %s from cache was rejected\n", name.c_str()); + + GLuint shader; + bool linkingSucess = false; + + if (!glCreateShader || !glDeleteShader) + goto error; + + shader = glCreateShader(GL_COMPUTE_SHADER); + + if (!CompilerShader(shader, source, name, "compute")) + goto error; + + linkingSucess = LinkProgram(result, &shader, 1); + +error: + glDeleteShader(shader); + + if (!linkingSucess) + { + glDeleteProgram(result); } + /*else + { + GLint length; + GLenum format; + glGetProgramiv(result, GL_PROGRAM_BINARY_LENGTH, &length); + + u8* buffer = new u8[length]; + glGetProgramBinary(result, length, nullptr, &format, buffer); + + ShaderCache.emplace(sourceHash, ShaderCacheEntry(buffer, length, format)); + NewShaders.push_back(sourceHash); + }*/ + + return linkingSucess; } -void UseShaderProgram(GLuint* ids) +bool CompileVertexFragmentProgram(GLuint& result, + const std::string& vs, const std::string& fs, + const std::string& name, + const std::initializer_list& vertexInAttrs, + const std::initializer_list& fragmentOutAttrs) { - if (glUseProgram) - { // If OpenGL isn't loaded, then there's no shader program to use - glUseProgram(ids[2]); + GLuint shaders[2] = + { + glCreateShader(GL_VERTEX_SHADER), + 
glCreateShader(GL_FRAGMENT_SHADER) + }; + result = glCreateProgram(); + + bool linkingSucess = false; + + if (!CompilerShader(shaders[0], vs, name, "vertex")) + goto error; + + if (!CompilerShader(shaders[1], fs, name, "fragment")) + goto error; + + + for (const AttributeTarget& target : vertexInAttrs) + { + glBindAttribLocation(result, target.Location, target.Name); } + for (const AttributeTarget& target : fragmentOutAttrs) + { + glBindFragDataLocation(result, target.Location, target.Name); + } + + linkingSucess = LinkProgram(result, shaders, 2); + +error: + glDeleteShader(shaders[1]); + glDeleteShader(shaders[0]); + + if (!linkingSucess) + glDeleteProgram(result); + + return linkingSucess; } } diff --git a/src/OpenGLSupport.h b/src/OpenGLSupport.h index ee5b50432e..f8c4430026 100644 --- a/src/OpenGLSupport.h +++ b/src/OpenGLSupport.h @@ -28,10 +28,23 @@ namespace melonDS::OpenGL { -bool BuildShaderProgram(const char* vs, const char* fs, GLuint* ids, const char* name); -bool LinkShaderProgram(GLuint* ids); -void DeleteShaderProgram(GLuint* ids); -void UseShaderProgram(GLuint* ids); +void LoadShaderCache(); +void SaveShaderCache(); + +struct AttributeTarget +{ + const char* Name; + u32 Location; +}; + + +bool CompileVertexFragmentProgram(GLuint& result, + const std::string& vs, const std::string& fs, + const std::string& name, + const std::initializer_list& vertexInAttrs, + const std::initializer_list& fragmentOutAttrs); + +bool CompileComputeProgram(GLuint& result, const std::string& source, const std::string& name); } diff --git a/src/frontend/qt_sdl/Config.cpp b/src/frontend/qt_sdl/Config.cpp index 2fdfc3badc..d6d018259c 100644 --- a/src/frontend/qt_sdl/Config.cpp +++ b/src/frontend/qt_sdl/Config.cpp @@ -22,6 +22,7 @@ #include #include "Platform.h" #include "Config.h" +#include "GPU.h" namespace Config @@ -59,6 +60,7 @@ bool Threaded3D; int GL_ScaleFactor; bool GL_BetterPolygons; +bool GL_HiresCoordinates; bool LimitFPS; int MaxFPS; @@ -246,11 +248,12 @@ 
ConfigEntry ConfigFile[] = {"ScreenVSync", 1, &ScreenVSync, false, false}, {"ScreenVSyncInterval", 0, &ScreenVSyncInterval, 1, false}, - {"3DRenderer", 0, &_3DRenderer, 0, false}, + {"3DRenderer", 0, &_3DRenderer, renderer3D_Software, false}, {"Threaded3D", 1, &Threaded3D, true, false}, {"GL_ScaleFactor", 0, &GL_ScaleFactor, 1, false}, {"GL_BetterPolygons", 1, &GL_BetterPolygons, false, false}, + {"GL_HiresCoordinates", 1, &GL_HiresCoordinates, true, false}, {"LimitFPS", 1, &LimitFPS, true, false}, {"MaxFPS", 0, &MaxFPS, 1000, false}, diff --git a/src/frontend/qt_sdl/Config.h b/src/frontend/qt_sdl/Config.h index 722384a316..38a1c34c75 100644 --- a/src/frontend/qt_sdl/Config.h +++ b/src/frontend/qt_sdl/Config.h @@ -51,6 +51,16 @@ enum micInputType_MAX, }; +enum +{ + renderer3D_Software = 0, +#ifdef OGLRENDERER_ENABLED + renderer3D_OpenGL, + renderer3D_OpenGLCompute, +#endif + renderer3D_Max, +}; + namespace Config { @@ -103,6 +113,7 @@ extern bool Threaded3D; extern int GL_ScaleFactor; extern bool GL_BetterPolygons; +extern bool GL_HiresCoordinates; extern bool LimitFPS; extern int MaxFPS; diff --git a/src/frontend/qt_sdl/EmuThread.cpp b/src/frontend/qt_sdl/EmuThread.cpp index d16aead4a1..abb0e8ac6c 100644 --- a/src/frontend/qt_sdl/EmuThread.cpp +++ b/src/frontend/qt_sdl/EmuThread.cpp @@ -52,10 +52,12 @@ #include "DSi_I2C.h" #include "GPU3D_Soft.h" #include "GPU3D_OpenGL.h" +#include "GPU3D_Compute.h" #include "Savestate.h" #include "ROMManager.h" +#include "EmuThread.h" //#include "ArchiveUtil.h" //#include "CameraManager.h" @@ -94,9 +96,8 @@ EmuThread::EmuThread(QObject* parent) : QThread(parent) } std::unique_ptr EmuThread::CreateConsole( - std::unique_ptr&& ndscart, - std::unique_ptr&& gbacart -) noexcept + std::unique_ptr &&ndscart, + std::unique_ptr &&gbacart) noexcept { auto arm7bios = ROMManager::LoadARM7BIOS(); if (!arm7bios) @@ -326,21 +327,12 @@ void EmuThread::run() videoRenderer = 0; } - if (videoRenderer == 0) - { // If we're using the software 
renderer... - NDS->GPU.SetRenderer3D(std::make_unique(Config::Threaded3D != 0)); - } - else - { - auto glrenderer = melonDS::GLRenderer::New(); - glrenderer->SetRenderSettings(Config::GL_BetterPolygons, Config::GL_ScaleFactor); - NDS->GPU.SetRenderer3D(std::move(glrenderer)); - } + updateRenderer(); Input::Init(); u32 nframes = 0; - double perfCountsSec = 1.0 / SDL_GetPerformanceFrequency(); + perfCountsSec = 1.0 / SDL_GetPerformanceFrequency(); double lastTime = SDL_GetPerformanceCounter() * perfCountsSec; double frameLimitError = 0.0; double lastMeasureTime = lastTime; @@ -451,20 +443,9 @@ void EmuThread::run() videoRenderer = 0; } - videoRenderer = screenGL ? Config::_3DRenderer : 0; + updateRenderer(); videoSettingsDirty = false; - - if (videoRenderer == 0) - { // If we're using the software renderer... - NDS->GPU.SetRenderer3D(std::make_unique(Config::Threaded3D != 0)); - } - else - { - auto glrenderer = melonDS::GLRenderer::New(); - glrenderer->SetRenderSettings(Config::GL_BetterPolygons, Config::GL_ScaleFactor); - NDS->GPU.SetRenderer3D(std::move(glrenderer)); - } } // process input and hotkeys @@ -512,7 +493,16 @@ void EmuThread::run() // emulate - u32 nlines = NDS->RunFrame(); + u32 nlines; + if (NDS->GPU.GetRenderer3D().NeedsShaderCompile()) + { + compileShaders(); + nlines = 0; + } + else + { + nlines = NDS->RunFrame(); + } if (ROMManager::NDSSave) ROMManager::NDSSave->CheckFlush(); @@ -750,3 +740,53 @@ bool EmuThread::emuIsActive() { return (RunningSomething == 1); } + +void EmuThread::updateRenderer() +{ + if (videoRenderer != lastVideoRenderer) + { + printf("creating renderer %d\n", videoRenderer); + switch (videoRenderer) + { + case renderer3D_Software: + NDS->GPU.SetRenderer3D(std::make_unique()); + break; + case renderer3D_OpenGL: + NDS->GPU.SetRenderer3D(GLRenderer::New()); + break; + case renderer3D_OpenGLCompute: + NDS->GPU.SetRenderer3D(ComputeRenderer::New()); + break; + default: __builtin_unreachable(); + } + } + lastVideoRenderer = 
videoRenderer; + + switch (videoRenderer) + { + case renderer3D_Software: + static_cast(NDS->GPU.GetRenderer3D()).SetThreaded(Config::Threaded3D, NDS->GPU); + break; + case renderer3D_OpenGL: + static_cast(NDS->GPU.GetRenderer3D()).SetRenderSettings(Config::GL_BetterPolygons, Config::GL_ScaleFactor); + break; + case renderer3D_OpenGLCompute: + static_cast(NDS->GPU.GetRenderer3D()).SetRenderSettings(Config::GL_ScaleFactor, Config::GL_HiresCoordinates); + break; + default: __builtin_unreachable(); + } +} + +void EmuThread::compileShaders() +{ + int currentShader, shadersCount; + u64 startTime = SDL_GetPerformanceCounter(); + // kind of hacky to look at the wallclock, though it is easier than + // than disabling vsync + do + { + NDS->GPU.GetRenderer3D().ShaderCompileStep(currentShader, shadersCount); + } while (NDS->GPU.GetRenderer3D().NeedsShaderCompile() && + (SDL_GetPerformanceCounter() - startTime) * perfCountsSec < 1.0 / 6.0); + mainWindow->osdAddMessage(0, "Compiling shader %d/%d", currentShader+1, shadersCount); +} diff --git a/src/frontend/qt_sdl/EmuThread.h b/src/frontend/qt_sdl/EmuThread.h index 4950ebbf62..4b19acf9af 100644 --- a/src/frontend/qt_sdl/EmuThread.h +++ b/src/frontend/qt_sdl/EmuThread.h @@ -94,6 +94,9 @@ class EmuThread : public QThread void syncVolumeLevel(); private: + void updateRenderer(); + void compileShaders(); + std::unique_ptr CreateConsole( std::unique_ptr&& ndscart, std::unique_ptr&& gbacart @@ -127,8 +130,9 @@ class EmuThread : public QThread int autoScreenSizing; - int videoRenderer; - bool videoSettingsDirty; + int lastVideoRenderer = -1; + + double perfCountsSec; }; #endif // EMUTHREAD_H diff --git a/src/frontend/qt_sdl/Screen.cpp b/src/frontend/qt_sdl/Screen.cpp index 732365042d..9174d3dd6d 100644 --- a/src/frontend/qt_sdl/Screen.cpp +++ b/src/frontend/qt_sdl/Screen.cpp @@ -709,19 +709,17 @@ void ScreenPanelGL::initOpenGL() glContext->MakeCurrent(); - OpenGL::BuildShaderProgram(kScreenVS, kScreenFS, screenShaderProgram, 
"ScreenShader"); - GLuint pid = screenShaderProgram[2]; - glBindAttribLocation(pid, 0, "vPosition"); - glBindAttribLocation(pid, 1, "vTexcoord"); - glBindFragDataLocation(pid, 0, "oColor"); + OpenGL::CompileVertexFragmentProgram(screenShaderProgram, + kScreenVS, kScreenFS, + "ScreenShader", + {{"vPosition", 0}, {"vTexcoord", 1}}, + {{"oColor", 0}}); - OpenGL::LinkShaderProgram(screenShaderProgram); + glUseProgram(screenShaderProgram); + glUniform1i(glGetUniformLocation(screenShaderProgram, "ScreenTex"), 0); - glUseProgram(pid); - glUniform1i(glGetUniformLocation(pid, "ScreenTex"), 0); - - screenShaderScreenSizeULoc = glGetUniformLocation(pid, "uScreenSize"); - screenShaderTransformULoc = glGetUniformLocation(pid, "uTransform"); + screenShaderScreenSizeULoc = glGetUniformLocation(screenShaderProgram, "uScreenSize"); + screenShaderTransformULoc = glGetUniformLocation(screenShaderProgram, "uTransform"); // to prevent bleeding between both parts of the screen // with bilinear filtering enabled @@ -769,21 +767,19 @@ void ScreenPanelGL::initOpenGL() memset(zeroData, 0, sizeof(zeroData)); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 192, 256, 2, GL_RGBA, GL_UNSIGNED_BYTE, zeroData); + OpenGL::CompileVertexFragmentProgram(osdShader, + kScreenVS_OSD, kScreenFS_OSD, + "OSDShader", + {{"vPosition", 0}}, + {{"oColor", 0}}); - OpenGL::BuildShaderProgram(kScreenVS_OSD, kScreenFS_OSD, osdShader, "OSDShader"); - - pid = osdShader[2]; - glBindAttribLocation(pid, 0, "vPosition"); - glBindFragDataLocation(pid, 0, "oColor"); - - OpenGL::LinkShaderProgram(osdShader); - glUseProgram(pid); - glUniform1i(glGetUniformLocation(pid, "OSDTex"), 0); + glUseProgram(osdShader); + glUniform1i(glGetUniformLocation(osdShader, "OSDTex"), 0); - osdScreenSizeULoc = glGetUniformLocation(pid, "uScreenSize"); - osdPosULoc = glGetUniformLocation(pid, "uOSDPos"); - osdSizeULoc = glGetUniformLocation(pid, "uOSDSize"); - osdScaleFactorULoc = glGetUniformLocation(pid, "uScaleFactor"); + osdScreenSizeULoc = 
glGetUniformLocation(osdShader, "uScreenSize"); + osdPosULoc = glGetUniformLocation(osdShader, "uOSDPos"); + osdSizeULoc = glGetUniformLocation(osdShader, "uOSDSize"); + osdScaleFactorULoc = glGetUniformLocation(osdShader, "uScaleFactor"); const float osdvertices[6*2] = { @@ -818,8 +814,7 @@ void ScreenPanelGL::deinitOpenGL() glDeleteVertexArrays(1, &screenVertexArray); glDeleteBuffers(1, &screenVertexBuffer); - OpenGL::DeleteShaderProgram(screenShaderProgram); - + glDeleteProgram(screenShaderProgram); for (const auto& [key, tex] : osdTextures) { @@ -830,8 +825,7 @@ void ScreenPanelGL::deinitOpenGL() glDeleteVertexArrays(1, &osdVertexArray); glDeleteBuffers(1, &osdVertexBuffer); - OpenGL::DeleteShaderProgram(osdShader); - + glDeleteProgram(osdShader); glContext->DoneCurrent(); @@ -885,7 +879,7 @@ void ScreenPanelGL::drawScreenGL() glViewport(0, 0, w, h); - glUseProgram(screenShaderProgram[2]); + glUseProgram(screenShaderProgram); glUniform2f(screenShaderScreenSizeULoc, w / factor, h / factor); int frontbuf = emuThread->FrontBuffer; @@ -895,7 +889,7 @@ void ScreenPanelGL::drawScreenGL() if (emuThread->NDS->GPU.GetRenderer3D().Accelerated) { // hardware-accelerated render - static_cast(emuThread->NDS->GPU.GetRenderer3D()).GetCompositor().BindOutputTexture(frontbuf); + emuThread->NDS->GPU.GetRenderer3D().BindOutputTexture(frontbuf); } else #endif @@ -936,7 +930,7 @@ void ScreenPanelGL::drawScreenGL() u32 y = kOSDMargin; - glUseProgram(osdShader[2]); + glUseProgram(osdShader); glUniform2f(osdScreenSizeULoc, w, h); glUniform1f(osdScaleFactorULoc, factor); diff --git a/src/frontend/qt_sdl/Screen.h b/src/frontend/qt_sdl/Screen.h index c2f7fda180..4ef4feca5b 100644 --- a/src/frontend/qt_sdl/Screen.h +++ b/src/frontend/qt_sdl/Screen.h @@ -172,7 +172,7 @@ class ScreenPanelGL : public ScreenPanel GLuint screenVertexBuffer, screenVertexArray; GLuint screenTexture; - GLuint screenShaderProgram[3]; + GLuint screenShaderProgram; GLuint screenShaderTransformULoc, 
screenShaderScreenSizeULoc; QMutex screenSettingsLock; @@ -181,7 +181,7 @@ class ScreenPanelGL : public ScreenPanel int lastScreenWidth = -1, lastScreenHeight = -1; - GLuint osdShader[3]; + GLuint osdShader; GLint osdScreenSizeULoc, osdPosULoc, osdSizeULoc; GLfloat osdScaleFactorULoc; GLuint osdVertexArray; diff --git a/src/frontend/qt_sdl/VideoSettingsDialog.cpp b/src/frontend/qt_sdl/VideoSettingsDialog.cpp index d5ee44c962..368c5e8769 100644 --- a/src/frontend/qt_sdl/VideoSettingsDialog.cpp +++ b/src/frontend/qt_sdl/VideoSettingsDialog.cpp @@ -23,6 +23,7 @@ #include "types.h" #include "Platform.h" #include "Config.h" +#include "GPU.h" #include "VideoSettingsDialog.h" #include "ui_VideoSettingsDialog.h" @@ -30,11 +31,20 @@ inline bool UsesGL() { - return (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0); + return (Config::ScreenUseGL != 0) || (Config::_3DRenderer != renderer3D_Software); } VideoSettingsDialog* VideoSettingsDialog::currentDlg = nullptr; +void VideoSettingsDialog::setEnabled() +{ + bool softwareRenderer = Config::_3DRenderer == renderer3D_Software; + ui->cbGLDisplay->setEnabled(softwareRenderer); + ui->cbSoftwareThreaded->setEnabled(softwareRenderer); + ui->cbxGLResolution->setEnabled(!softwareRenderer); + ui->cbBetterPolygons->setEnabled(Config::_3DRenderer == renderer3D_OpenGL); + ui->cbxComputeHiResCoords->setEnabled(Config::_3DRenderer == renderer3D_OpenGLCompute); +} VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui(new Ui::VideoSettingsDialog) { @@ -48,10 +58,12 @@ VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui( oldSoftThreaded = Config::Threaded3D; oldGLScale = Config::GL_ScaleFactor; oldGLBetterPolygons = Config::GL_BetterPolygons; + oldHiresCoordinates = Config::GL_HiresCoordinates; grp3DRenderer = new QButtonGroup(this); - grp3DRenderer->addButton(ui->rb3DSoftware, 0); - grp3DRenderer->addButton(ui->rb3DOpenGL, 1); + grp3DRenderer->addButton(ui->rb3DSoftware, 
renderer3D_Software); + grp3DRenderer->addButton(ui->rb3DOpenGL, renderer3D_OpenGL); + grp3DRenderer->addButton(ui->rb3DCompute, renderer3D_OpenGLCompute); #if QT_VERSION < QT_VERSION_CHECK(5, 15, 0) connect(grp3DRenderer, SIGNAL(buttonClicked(int)), this, SLOT(onChange3DRenderer(int))); #else @@ -75,25 +87,13 @@ VideoSettingsDialog::VideoSettingsDialog(QWidget* parent) : QDialog(parent), ui( ui->cbxGLResolution->setCurrentIndex(Config::GL_ScaleFactor-1); ui->cbBetterPolygons->setChecked(Config::GL_BetterPolygons != 0); + ui->cbxComputeHiResCoords->setChecked(Config::GL_HiresCoordinates != 0); if (!Config::ScreenVSync) ui->sbVSyncInterval->setEnabled(false); setVsyncControlEnable(UsesGL()); - if (Config::_3DRenderer == 0) - { - ui->cbGLDisplay->setEnabled(true); - ui->cbSoftwareThreaded->setEnabled(true); - ui->cbxGLResolution->setEnabled(false); - ui->cbBetterPolygons->setEnabled(false); - } - else - { - ui->cbGLDisplay->setEnabled(false); - ui->cbSoftwareThreaded->setEnabled(false); - ui->cbxGLResolution->setEnabled(true); - ui->cbBetterPolygons->setEnabled(true); - } + setEnabled(); } VideoSettingsDialog::~VideoSettingsDialog() @@ -119,6 +119,7 @@ void VideoSettingsDialog::on_VideoSettingsDialog_rejected() Config::Threaded3D = oldSoftThreaded; Config::GL_ScaleFactor = oldGLScale; Config::GL_BetterPolygons = oldGLBetterPolygons; + Config::GL_HiresCoordinates = oldHiresCoordinates; emit updateVideoSettings(old_gl != UsesGL()); @@ -133,31 +134,18 @@ void VideoSettingsDialog::setVsyncControlEnable(bool hasOGL) void VideoSettingsDialog::onChange3DRenderer(int renderer) { - bool old_gl = (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0); + bool old_gl = UsesGL(); Config::_3DRenderer = renderer; - if (renderer == 0) - { - ui->cbGLDisplay->setEnabled(true); - ui->cbSoftwareThreaded->setEnabled(true); - ui->cbxGLResolution->setEnabled(false); - ui->cbBetterPolygons->setEnabled(false); - } - else - { - ui->cbGLDisplay->setEnabled(false); - 
ui->cbSoftwareThreaded->setEnabled(false); - ui->cbxGLResolution->setEnabled(true); - ui->cbBetterPolygons->setEnabled(true); - } + setEnabled(); emit updateVideoSettings(old_gl != UsesGL()); } void VideoSettingsDialog::on_cbGLDisplay_stateChanged(int state) { - bool old_gl = (Config::ScreenUseGL != 0) || (Config::_3DRenderer != 0); + bool old_gl = UsesGL(); Config::ScreenUseGL = (state != 0); @@ -205,3 +193,10 @@ void VideoSettingsDialog::on_cbBetterPolygons_stateChanged(int state) emit updateVideoSettings(false); } + +void VideoSettingsDialog::on_cbxComputeHiResCoords_stateChanged(int state) +{ + Config::GL_HiresCoordinates = (state != 0); + + emit updateVideoSettings(false); +} diff --git a/src/frontend/qt_sdl/VideoSettingsDialog.h b/src/frontend/qt_sdl/VideoSettingsDialog.h index 29af8e1592..97e0dbd0d9 100644 --- a/src/frontend/qt_sdl/VideoSettingsDialog.h +++ b/src/frontend/qt_sdl/VideoSettingsDialog.h @@ -65,10 +65,12 @@ private slots: void on_cbxGLResolution_currentIndexChanged(int idx); void on_cbBetterPolygons_stateChanged(int state); + void on_cbxComputeHiResCoords_stateChanged(int state); void on_cbSoftwareThreaded_stateChanged(int state); private: void setVsyncControlEnable(bool hasOGL); + void setEnabled(); Ui::VideoSettingsDialog* ui; @@ -81,6 +83,7 @@ private slots: int oldSoftThreaded; int oldGLScale; int oldGLBetterPolygons; + int oldHiresCoordinates; }; #endif // VIDEOSETTINGSDIALOG_H diff --git a/src/frontend/qt_sdl/VideoSettingsDialog.ui b/src/frontend/qt_sdl/VideoSettingsDialog.ui index 11cfe3d9bd..ff9baf8ff7 100644 --- a/src/frontend/qt_sdl/VideoSettingsDialog.ui +++ b/src/frontend/qt_sdl/VideoSettingsDialog.ui @@ -6,7 +6,7 @@ 0 0 - 408 + 427 262 @@ -24,7 +24,7 @@ QLayout::SetFixedSize - -1 + 6 @@ -39,6 +39,16 @@ + + + + <html><head/><body><p>Enabling this may help reduce distortion on quads and more complex polygons, but may also reduce performance.</p></body></html> + + + Improved polygon splitting + + + @@ -46,13 +56,10 @@ - - - - 
<html><head/><body><p>Enabling this may help reduce distortion on quads and more complex polygons, but may also reduce performance.</p></body></html> - + + - Improved polygon splitting + Use high resolution coordinates @@ -94,23 +101,7 @@ Display settings - - - - - 0 - 0 - - - - <html><head/><body><p>The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...</p></body></html> - - - VSync interval: - - - - + <html><head/><body><p>The interval at which to synchronize to the monitor's refresh rate. Set to 1 for a 60Hz monitor, 2 for 120Hz, ...</p></body></html> @@ -123,7 +114,7 @@ - + <html><head/><body><p>Use OpenGL to draw the DS screens to the main window. May result in better frame pacing. Mandatory when using the OpenGL 3D renderer.</p></body></html> @@ -133,17 +124,7 @@ - - - - <html><head/><body><p>When using OpenGL, synchronize the video output to your monitor's refresh rate.</p></body></html> - - - VSync - - - - + Qt::Vertical @@ -159,13 +140,39 @@ + + + + <html><head/><body><p>When using OpenGL, synchronize the video output to your monitor's refresh rate.</p></body></html> + + + VSync + + + + + + + + 0 + 0 + + + + <html><head/><body><p>The interval at which to synchronize to the monitor's refresh rate. 
Set to 1 for a 60Hz monitor, 2 for 120Hz, ...</p></body></html> + + + VSync interval: + + + <html><head/><body><p>The OpenGL renderer may be faster than software and supports graphical enhancements, but is more prone to glitches.</p></body></html> - OpenGL + OpenGL (Classic) @@ -186,6 +193,13 @@ + + + + OpenGL (Compute shader) + + + diff --git a/src/frontend/qt_sdl/Window.cpp b/src/frontend/qt_sdl/Window.cpp index a99546bd9f..536e02195b 100644 --- a/src/frontend/qt_sdl/Window.cpp +++ b/src/frontend/qt_sdl/Window.cpp @@ -2048,6 +2048,7 @@ void MainWindow::onUpdateVideoSettings(bool glchange) connect(emuThread, SIGNAL(windowUpdate()), panel, SLOT(repaint())); } + printf("update video settings\n"); videoSettingsDirty = true; if (glchange) diff --git a/src/frontend/qt_sdl/main.cpp b/src/frontend/qt_sdl/main.cpp index 01ba52c7b1..54ade11931 100644 --- a/src/frontend/qt_sdl/main.cpp +++ b/src/frontend/qt_sdl/main.cpp @@ -175,10 +175,6 @@ bool camStarted[2]; //extern int AspectRatiosNum; - - - - static bool FileExtensionInList(const QString& filename, const QStringList& extensions, Qt::CaseSensitivity cs = Qt::CaseInsensitive) { return std::any_of(extensions.cbegin(), extensions.cend(), [&](const auto& ext) { @@ -339,10 +335,10 @@ int main(int argc, char** argv) if (!Config::Load()) QMessageBox::critical(NULL, "melonDS", "Unable to write to config.\nPlease check the write permissions of the folder you placed melonDS in."); -#define SANITIZE(var, min, max) { var = std::clamp(var, min, max); } +#define SANITIZE(var, min, max) { var = std::clamp(var, min, max); } SANITIZE(Config::ConsoleType, 0, 1); #ifdef OGLRENDERER_ENABLED - SANITIZE(Config::_3DRenderer, 0, 1); // 0 is the software renderer, 1 is the OpenGL renderer + SANITIZE(Config::_3DRenderer, 0, renderer3D_Max); #else SANITIZE(Config::_3DRenderer, 0, 0); #endif