From f1569d58d98c5e6d2d63335d4a25d6fb8a3e4d66 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 12 May 2025 16:15:57 -0400
Subject: [PATCH 1/5] Improvements for testing suite

---
 tests/test_functional.py | 46 +++++++++++++++++++++-------------------
 tests/test_ops.py        |  8 +++++--
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index c8a390733..96e77e4f4 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -94,7 +94,11 @@ class Test8BitBlockwiseQuantizeFunctional:
     @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64])
     @pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
     def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
+        iters = 100
+
         if device == "cpu":
+            iters = 10
+
             # This test is slow on CPU, so avoid atypical use cases.
             if nested:
                 pytest.skip("Not a typical use case.")
@@ -106,7 +110,7 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
 
         diffs = []
         reldiffs = []
-        for i in range(100):
+        for i in range(iters):
             A1 = torch.randn(1024, 1024, device=device, dtype=dtype)
             C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested)
             A2 = F.dequantize_blockwise(C, S)
@@ -116,15 +120,13 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
             reldiffs.append(reldiff.mean().item())
         abserr = sum(diffs) / len(diffs)
         relerr = sum(reldiffs) / len(reldiffs)
-        # print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(diffs)/len(diffs))
-        # print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(reldiffs)/len(reldiffs))
         assert abserr < 0.011
         assert relerr < 0.018
         assert A2.dtype == dtype
 
         diffs = []
         code = F.create_dynamic_map(signed=signed)
-        for i in range(100):
+        for i in range(iters):
             A1 = torch.rand(1024, 1024, device=device, dtype=dtype)
             C, S = F.quantize_blockwise(A1, blocksize=blocksize, nested=nested, code=code)
             A2 = F.dequantize_blockwise(C, S)
@@ -142,29 +144,29 @@ def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize,
             assert abserr < 0.00175
             assert relerr < 0.012
         assert A2.dtype == dtype
-        # print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs))
-        # print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs))
 
-    def test_blockwise_cpu_large(self):
+    @pytest.mark.skipif("cpu" not in get_available_devices(), reason="CPU is required")
+    @pytest.mark.parametrize("hidden", [128])
+    @pytest.mark.parametrize("blocksize", [4096, 16384])
+    def test_blockwise_cpu_large(self, hidden, blocksize):
         diffs = []
         reldiffs = []
         batch = 128
         seq = 128
-        for hidden in [128]:  # , 14336]:
-            for blocksize in [4096, 16384]:
-                for i in range(2):
-                    A1 = torch.randn(batch, seq, hidden, device="cpu")
-                    t0 = time.time()
-                    C, S = F.quantize_blockwise(A1, blocksize=blocksize)
-                    A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
-                    print(time.time() - t0)
-                    diff = torch.abs(A1 - A2)
-                    reldiff = diff / torch.abs(A1 + 1e-8)
-                    diffs.append(diff.mean().item())
-                    reldiffs.append(reldiff.mean().item())
-                    assert diffs[-1] < 0.011
-                # print(sum(diffs)/len(diffs))
-                # print(sum(reldiffs)/len(reldiffs))
+
+        for i in range(2):
+            A1 = torch.randn(batch, seq, hidden, device="cpu")
+            t0 = time.time()
+            C, S = F.quantize_blockwise(A1, blocksize=blocksize)
+            A2 = F.dequantize_blockwise(C, S, blocksize=blocksize)
+            print(time.time() - t0)
+            diff = torch.abs(A1 - A2)
+            reldiff = diff / torch.abs(A1 + 1e-8)
+            diffs.append(diff.mean().item())
+            reldiffs.append(reldiff.mean().item())
+            assert diffs[-1] < 0.011
+        # print(sum(diffs)/len(diffs))
+        # print(sum(reldiffs)/len(reldiffs))
 
     @pytest.mark.parametrize("device", get_available_devices())
     @pytest.mark.parametrize("bits", range(2, 9), ids=id_formatter("bits"))
diff --git a/tests/test_ops.py b/tests/test_ops.py
index ea448f99b..4da1663f0 100644
--- a/tests/test_ops.py
+++ b/tests/test_ops.py
@@ -97,8 +97,12 @@ class TestInt8BlockwiseQuantOps:
     @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=id_formatter("dtype"))
     @pytest.mark.parametrize("blocksize", [64, 128, 256, 512])
     def test_quantize_blockwise(self, device, dtype, blocksize):
-        if device == "cpu" and dtype != torch.float32:
-            pytest.skip("CPU implementation is only available for float32")
+        if device == "cpu":
+            if dtype != torch.float32:
+                pytest.skip("CPU implementation is only available for float32")
+
+            if blocksize != 256:
+                pytest.skip("CPU implementation is slow; only test blocksize=256")
 
         code = bitsandbytes.functional.create_dynamic_map().to(device)
         A = torch.randn(1024, 1024, dtype=dtype, device=device)

From d4025e027910c7ffc7516dca644e64f933d7ad9c Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 12 May 2025 17:47:48 -0400
Subject: [PATCH 2/5] Add workflow for macOS arm64 CPU tests

---
 .github/workflows/tests.yml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index e930bd455..5bb8329f7 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -15,7 +15,7 @@ jobs:
   build-cpu:
     strategy:
       matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         include:
           - os: ubuntu-22.04
             arch: x86_64
@@ -23,6 +23,8 @@ jobs:
             arch: aarch64
           - os: windows-2025
             arch: x86_64
+          - os: macos-15
+            arch: arm64
     runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
@@ -97,7 +99,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025]
+        os: [ubuntu-22.04, ubuntu-22.04-arm, windows-2025, macos-15]
         torch_version: ["2.7.0"]
         include:
           - os: ubuntu-22.04
@@ -106,6 +108,8 @@ jobs:
             arch: aarch64
           - os: windows-2025
             arch: x86_64
+          - os: macos-15
+            arch: arm64
     runs-on: ${{ matrix.os }}
     env:
       BNB_TEST_DEVICE: cpu

From 5b12eb4bfad64e815f8d970a0ed95fafc0527470 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 14 May 2025 11:30:59 -0400
Subject: [PATCH 3/5] Update tests.yml: build CUDA targets 75 and 89; run Linux CUDA tests on the g6 runner

---
 .github/workflows/tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5bb8329f7..9446e3c2b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -77,14 +77,14 @@ jobs:
         if: startsWith(matrix.os, 'windows')
         uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl
 
-      # We're running on T4 only for now, so we only target sm75.
+      # Target just the architectures used for our runners.
       - name: Build C++ / CUDA
         run: bash .github/scripts/build-cuda.sh
         env:
           build_os: ${{ matrix.os }}
           build_arch: ${{ matrix.arch }}
           cuda_version: ${{ matrix.cuda_version }}
-          cuda_targets: "75"
+          cuda_targets: "75;89"
 
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
@@ -162,7 +162,7 @@ jobs:
           - os: windows-2025
             cuda_version: "12.8.1"
     runs-on:
-      labels: ${{ contains(matrix.os, 'windows') && 'CUDA-Windows-x64' || 'CUDA-Linux-x64' }}
+      labels: ${{ contains(matrix.os, 'windows') && 'CUDA-Windows-x64' || 'bandb-aws-g6-4xlarge-plus-use1-public-80' }}
     env:
       BNB_TEST_DEVICE: cuda
     steps:

From d8a4fe29135ae57e6bf8fab3d8e9cd6fe29df9a7 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 14 May 2025 12:31:04 -0400
Subject: [PATCH 4/5] Update tests.yml

Use new L4 and CPU runners for testing.
---
 .github/workflows/tests.yml | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 9446e3c2b..f0e281a8f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -104,13 +104,14 @@ jobs:
         include:
           - os: ubuntu-22.04
             arch: x86_64
+            runner: banb-aws-general-8-plus-use1-public-80
           - os: ubuntu-22.04-arm
             arch: aarch64
           - os: windows-2025
             arch: x86_64
           - os: macos-15
             arch: arm64
-    runs-on: ${{ matrix.os }}
+    runs-on: ${{ matrix.runner || matrix.os }}
     env:
       BNB_TEST_DEVICE: cpu
     steps:
@@ -156,13 +157,22 @@ jobs:
           - cuda_version: "12.8.1"
             torch_version: "2.7.0"
             pypi_index: "https://download.pytorch.org/whl/cu128"
+
+          # L4 runners
+          - os: ubuntu-22.04
+            runner: bandb-aws-g6-4xlarge-plus-use1-public-80
+
+          # T4 runners
+          - os: ubuntu-22.04
+            runner: CUDA-Linux-x64
+          - os: windows-2025
+            runner: CUDA-Windows-x64
         exclude:
           # Our current T4 Windows runner has a driver too old (471.11)
           # and cannot support CUDA 12+. Skip for now.
           - os: windows-2025
             cuda_version: "12.8.1"
-    runs-on:
-      labels: ${{ contains(matrix.os, 'windows') && 'CUDA-Windows-x64' || 'bandb-aws-g6-4xlarge-plus-use1-public-80' }}
+    runs-on: ${{ matrix.runner }}
     env:
       BNB_TEST_DEVICE: cuda
     steps:

From 004fb2ff5bbeecac85a25cbef34ad2aa96859cf8 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 14 May 2025 12:47:51 -0400
Subject: [PATCH 5/5] Update tests.yml: add gpu (T4, L4) matrix axis to CUDA tests; exclude Windows L4

---
 .github/workflows/tests.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index f0e281a8f..9431b32f4 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -149,6 +149,7 @@ jobs:
       matrix:
         os: [ubuntu-22.04, windows-2025]
         arch: [x86_64]
+        gpu: [T4, L4]
         cuda_version: ["11.8.0", "12.8.1"]
         include:
           - cuda_version: "11.8.0"
@@ -160,18 +161,25 @@ jobs:
 
           # L4 runners
           - os: ubuntu-22.04
+            gpu: L4
             runner: bandb-aws-g6-4xlarge-plus-use1-public-80
 
           # T4 runners
           - os: ubuntu-22.04
+            gpu: T4
             runner: CUDA-Linux-x64
           - os: windows-2025
+            gpu: T4
             runner: CUDA-Windows-x64
         exclude:
           # Our current T4 Windows runner has a driver too old (471.11)
           # and cannot support CUDA 12+. Skip for now.
           - os: windows-2025
             cuda_version: "12.8.1"
+
+          # No Windows L4 runners.
+          - os: windows-2025
+            gpu: L4
     runs-on: ${{ matrix.runner }}
     env:
       BNB_TEST_DEVICE: cuda