From 9b99173c9fe0a1370fcbb870ddafe7930f96731b Mon Sep 17 00:00:00 2001
From: Ali Hassani <68103095+alihassanijr@users.noreply.github.com>
Date: Fri, 8 Mar 2024 00:35:03 -0500
Subject: [PATCH] Fused neighborhood attention (#111)
* Fused neighborhood attention (FNA) kernels (forward pass only for now)
  * 1D, 2D and 3D Neighborhood Attention are supported,
  * Causal neighborhood attention is implemented,
  * Window (kernel) size, dilation, and causality can be defined *per-axis*,
  * All GPU architectures since Maxwell (SM50) are supported,
    * SM50 up to SM70 are SIMT-only, but support both FP16 and FP32,
    * SM70 and SM75 target Tensor Cores in FP16, and SIMT-style in FP32,
    * SM80 and above target Tensor Cores in FP16, BF16, and FP32.
  * Relative positional biases are implemented (not defined for causal masking yet),
  * Memory layout in FNA is different from existing kernels (`[B, *, heads, dim]` instead of
    `[B, heads, *, dim]`); see the sketch at the end of this message.
  * Eventually this layout will allow skipping the permute/explicit reshape step in the
    attention module following the QKV projection.
* Naive kernels now implement and allow causal masking,
* Naive kernels (CPU and CUDA) now allow varying parameters (window size, dilation, causal)
  across axes,
* Major bug fix in Volta GEMM kernels
  * The epilogue was different for Volta, and it slipped through unit tests,
  * Tests are now more aggressive, and the issue has been fixed.
* Minor torch bug fixed
  * Streams were not being selected correctly if users set a tensor to a device other than
    cuda:0. Thanks to @AdityaKane2001 for discovering it.
* Documentation (finally):
  * Better late than never, but finally added more documentation and reorganized docs under
    docs/ instead of shoving everything into the readme.
* So much more that I forgot (in part due to lack of documentation).
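
A rough illustration of the layout difference (a plain-PyTorch sketch with made-up shapes; this is not the NATTEN API):

```python
import torch

B, H, W, heads, dim = 2, 14, 14, 4, 32

# Output of a fused QKV projection for a 2D problem.
qkv = torch.randn(B, H, W, 3 * heads * dim).view(B, H, W, 3, heads, dim)

# Existing (BMM-style) kernels expect [B, heads, H, W, dim], so the query
# has to be permuted (and usually copied via .contiguous()):
q_bmm = qkv[..., 0, :, :].permute(0, 3, 1, 2, 4).contiguous()

# FNA kernels expect [B, H, W, heads, dim], which is what the projection
# already produces, so the permute/reshape copy can eventually be skipped:
q_fna = qkv[..., 0, :, :]

print(q_bmm.shape)  # torch.Size([2, 4, 14, 14, 32])
print(q_fna.shape)  # torch.Size([2, 14, 14, 4, 32])
```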
---
CHANGELOG.md | 19 +
LICENSE | 34 +
Makefile | 2 +-
README.md | 382 +-
assets/README_pypi.md | 356 +-
assets/cudamemory_dark.png | Bin 172479 -> 0 bytes
assets/cudamemory_light.png | Bin 162966 -> 0 bytes
assets/cudatime_dark.png | Bin 137097 -> 0 bytes
assets/cudatime_light.png | Bin 130284 -> 0 bytes
assets/gemm_vs_naive.png | Bin 436915 -> 0 bytes
csrc/CMakeLists.txt | 2 +
.../natten_autogen/cpu/naive/kernels.h | 476 +-
.../natten_autogen/cuda/fna/dispatch_cm.h | 775 +
.../natten_autogen/cuda/fna/dispatch_device.h | 85 +
.../natten_autogen/cuda/fna/dispatch_dtype.h | 220 +
.../natten_autogen/cuda/fna/interface.h | 38 +
.../include/natten_autogen/cuda/fna/kernels.h | 14094 +++++++++++++
.../cuda/gemm/2d/sm70/dispatch_align.h | 1218 +-
.../cuda/gemm/2d/sm70/dispatch_kernel_size.h | 135 +
.../cuda/gemm/2d/sm70/kernels.h | 2801 ++-
.../cuda/gemm/2d/sm75/dispatch_align.h | 1218 +-
.../cuda/gemm/2d/sm75/dispatch_kernel_size.h | 135 +
.../cuda/gemm/2d/sm75/kernels.h | 2801 ++-
.../cuda/gemm/2d/sm80/dispatch_align.h | 5023 ++++-
.../cuda/gemm/2d/sm80/dispatch_kernel_size.h | 540 +
.../cuda/gemm/2d/sm80/kernels.h | 9760 ++++++++-
.../natten_autogen/cuda/naive/dispatch_cm.h | 1192 ++
.../natten_autogen/cuda/naive/dispatch_di.h | 4216 ----
.../natten_autogen/cuda/naive/dispatch_ks.h | 1516 --
.../natten_autogen/cuda/naive/interface.h | 158 +-
.../natten_autogen/cuda/naive/kernels.h | 16470 +++-------------
csrc/autogen/src/cpu/naive/source_0.cpp | 269 +-
csrc/autogen/src/cpu/naive/source_1.cpp | 279 +-
csrc/autogen/src/cuda/fna/source_0.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_1.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_10.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_11.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_12.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_13.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_14.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_15.cu | 781 +
csrc/autogen/src/cuda/fna/source_2.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_3.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_4.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_5.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_6.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_7.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_8.cu | 2573 +++
csrc/autogen/src/cuda/fna/source_9.cu | 2573 +++
.../autogen/src/cuda/gemm/2d/sm70/source_0.cu | 956 +-
.../autogen/src/cuda/gemm/2d/sm70/source_1.cu | 1167 +-
.../autogen/src/cuda/gemm/2d/sm70/source_2.cu | 1114 +-
.../autogen/src/cuda/gemm/2d/sm70/source_3.cu | 1039 +-
.../autogen/src/cuda/gemm/2d/sm75/source_0.cu | 956 +-
.../autogen/src/cuda/gemm/2d/sm75/source_1.cu | 1167 +-
.../autogen/src/cuda/gemm/2d/sm75/source_2.cu | 1114 +-
.../autogen/src/cuda/gemm/2d/sm75/source_3.cu | 1039 +-
.../autogen/src/cuda/gemm/2d/sm80/source_0.cu | 424 +-
.../autogen/src/cuda/gemm/2d/sm80/source_1.cu | 578 +-
.../src/cuda/gemm/2d/sm80/source_10.cu | 449 +-
.../src/cuda/gemm/2d/sm80/source_11.cu | 603 +-
.../src/cuda/gemm/2d/sm80/source_12.cu | 619 +-
.../src/cuda/gemm/2d/sm80/source_13.cu | 619 +-
.../src/cuda/gemm/2d/sm80/source_14.cu | 465 +-
.../src/cuda/gemm/2d/sm80/source_15.cu | 469 +-
.../src/cuda/gemm/2d/sm80/source_16.cu | 469 +-
.../src/cuda/gemm/2d/sm80/source_17.cu | 469 +-
.../src/cuda/gemm/2d/sm80/source_18.cu | 477 +-
.../src/cuda/gemm/2d/sm80/source_19.cu | 469 +-
.../autogen/src/cuda/gemm/2d/sm80/source_2.cu | 634 +-
.../src/cuda/gemm/2d/sm80/source_20.cu | 469 +-
.../src/cuda/gemm/2d/sm80/source_21.cu | 629 +-
.../src/cuda/gemm/2d/sm80/source_22.cu | 709 +-
.../src/cuda/gemm/2d/sm80/source_23.cu | 619 +-
.../src/cuda/gemm/2d/sm80/source_24.cu | 559 +-
.../src/cuda/gemm/2d/sm80/source_25.cu | 505 +-
.../src/cuda/gemm/2d/sm80/source_26.cu | 469 +-
.../src/cuda/gemm/2d/sm80/source_27.cu | 469 +-
.../src/cuda/gemm/2d/sm80/source_28.cu | 517 +-
.../src/cuda/gemm/2d/sm80/source_29.cu | 469 +-
.../autogen/src/cuda/gemm/2d/sm80/source_3.cu | 634 +-
.../src/cuda/gemm/2d/sm80/source_30.cu | 469 +-
.../src/cuda/gemm/2d/sm80/source_31.cu | 409 -
.../autogen/src/cuda/gemm/2d/sm80/source_4.cu | 440 +-
.../autogen/src/cuda/gemm/2d/sm80/source_5.cu | 484 +-
.../autogen/src/cuda/gemm/2d/sm80/source_6.cu | 484 +-
.../autogen/src/cuda/gemm/2d/sm80/source_7.cu | 452 +-
.../autogen/src/cuda/gemm/2d/sm80/source_8.cu | 484 +-
.../autogen/src/cuda/gemm/2d/sm80/source_9.cu | 484 +-
csrc/autogen/src/cuda/naive/source_0.cu | 12093 ++----------
csrc/autogen/src/cuda/naive/source_1.cu | 12315 ++----------
csrc/include/natten/config.h | 2 +-
csrc/include/natten/cpu/na1d.h | 85 +-
csrc/include/natten/cpu/na2d.h | 93 +-
csrc/include/natten/cpu/na3d.h | 109 +-
.../cpu/naive/inverse_neighborhood_1d.hpp | 99 +-
.../cpu/naive/inverse_neighborhood_2d.hpp | 143 +-
.../cpu/naive/inverse_neighborhood_3d.hpp | 198 +-
.../natten/cpu/naive/natten_cpu_commons.h | 136 +-
.../naive/neighborhood_neighborhood_1d.hpp | 89 +-
.../naive/neighborhood_neighborhood_2d.hpp | 143 +-
.../naive/neighborhood_neighborhood_3d.hpp | 193 +-
.../cpu/naive/pointwise_neighborhood_1d.hpp | 278 +-
.../cpu/naive/pointwise_neighborhood_2d.hpp | 420 +-
.../cpu/naive/pointwise_neighborhood_3d.hpp | 363 +-
.../natten/cpu/naive/rel_pos_bias_1d.hpp | 57 +-
.../natten/cpu/naive/rel_pos_bias_2d.hpp | 82 +-
.../natten/cpu/naive/rel_pos_bias_3d.hpp | 117 +-
.../cuda/fna/epilogue/epilogue_pipelined.h | 638 +
.../fna/epilogue/epilogue_rescale_output.h | 266 +
.../epilogue_thread_apply_logsumexp.h | 181 +
.../fna/epilogue/predicated_tile_iterator.h | 629 +
.../predicated_tile_iterator_params.h | 152 +
csrc/include/natten/cuda/fna/fna_forward.cuh | 183 +
.../include/natten/cuda/fna/gemm/custom_mma.h | 139 +
.../natten/cuda/fna/gemm/custom_mma_base.h | 189 +
.../cuda/fna/gemm/custom_mma_multistage.h | 765 +
.../cuda/fna/gemm/custom_mma_pipelined.h | 408 +
.../natten/cuda/fna/gemm/find_default_mma.h | 272 +
.../cuda/fna/gemm/mma_accum_lambda_iterator.h | 393 +
.../natten/cuda/fna/gemm/mma_from_smem.h | 1966 ++
.../cuda/fna/gemm/replace_mma_iterators.h | 123 +
.../natten/cuda/fna/gemm_kernel_utils.h | 186 +
.../default_warp_iterator_from_smem.h | 149 +
.../epilogue_predicated_tile_iterator.h | 760 +
.../cuda/fna/iterators/make_residual_last.h | 108 +
.../predicated_tile_access_iterator.h | 1662 ++
...cated_tile_access_iterator_residual_last.h | 976 +
.../fna/iterators/predicated_tile_iterator.h | 831 +
.../predicated_tile_iterator_residual_last.h | 617 +
.../fna/iterators/transpose_warp_iterator.h | 60 +
.../fna/iterators/warp_iterator_from_smem.h | 290 +
csrc/include/natten/cuda/fna/kernel_forward.h | 1168 ++
csrc/include/natten/cuda/fna/na_utils.cuh | 1158 ++
.../cuda/fna/transform/tile_smem_loader.h | 95 +
.../natten/cuda/gemm/kernel/default_na.cuh | 21 +-
.../cuda/gemm/kernel/default_na1d_in.cuh | 375 +-
.../cuda/gemm/kernel/default_na1d_nn.cuh | 375 +-
.../cuda/gemm/kernel/default_na1d_pn.cuh | 382 +-
.../cuda/gemm/kernel/default_na2d_in.cuh | 395 +-
.../cuda/gemm/kernel/default_na2d_nn.cuh | 395 +-
.../cuda/gemm/kernel/default_na2d_pn.cuh | 402 +-
csrc/include/natten/cuda/gemm/na1d.cuh | 8 +-
csrc/include/natten/cuda/gemm/na2d.cuh | 17 +-
.../threadblock/default_epilogue_simt.cuh | 167 -
.../default_epilogue_tensor_op.cuh | 173 -
.../na1d_in_output_tile_iterator.cuh | 38 +-
.../na1d_nn_output_tile_iterator.cuh | 38 +-
.../na1d_pn_output_tile_iterator.cuh | 50 +-
.../na2d_in_output_tile_iterator.cuh | 38 +-
.../na2d_nn_output_tile_iterator.cuh | 38 +-
.../na2d_pn_output_tile_iterator.cuh | 50 +-
csrc/include/natten/cuda/na1d.cuh | 196 +-
csrc/include/natten/cuda/na2d.cuh | 251 +-
csrc/include/natten/cuda/na3d.cuh | 228 +-
.../cuda/naive/inverse_neighborhood_1d.cuh | 165 +-
.../cuda/naive/inverse_neighborhood_2d.cuh | 217 +-
.../cuda/naive/inverse_neighborhood_3d.cuh | 319 +-
.../natten/cuda/naive/natten_commons.cuh | 273 +-
.../naive/neighborhood_neighborhood_1d.cuh | 146 +-
.../naive/neighborhood_neighborhood_2d.cuh | 196 +-
.../naive/neighborhood_neighborhood_3d.cuh | 283 +-
.../cuda/naive/pointwise_neighborhood_1d.cuh | 272 +-
.../cuda/naive/pointwise_neighborhood_2d.cuh | 705 +-
.../cuda/naive/pointwise_neighborhood_3d.cuh | 517 +-
.../natten/cuda/naive/rel_pos_bias_1d.cuh | 127 +-
.../natten/cuda/naive/rel_pos_bias_2d.cuh | 172 +-
.../natten/cuda/naive/rel_pos_bias_3d.cuh | 267 +-
csrc/include/natten/cuda/naive/tiled/base.cuh | 105 +-
...wise_neighborhood_2d_tiled_11x11_13x13.cuh | 264 +-
.../pointwise_neighborhood_2d_tiled_3x3.cuh | 208 +-
.../pointwise_neighborhood_2d_tiled_5x5.cuh | 138 +-
...ointwise_neighborhood_2d_tiled_7x7_9x9.cuh | 260 +-
csrc/include/natten/cuda/utils/cuda.h | 35 +
.../cuda/{gemm/utils.cuh => utils/cutlass.h} | 4 +-
csrc/include/natten/naive_argpack.h | 38 +-
csrc/include/natten/natten.h | 130 +
csrc/include/natten/pytorch/cpu/na1d.h | 71 +-
csrc/include/natten/pytorch/cpu/na2d.h | 80 +-
csrc/include/natten/pytorch/cpu/na3d.h | 97 +-
csrc/include/natten/pytorch/cuda/helpers.cuh | 9 +-
csrc/include/natten/pytorch/cuda/na1d.cuh | 71 +-
csrc/include/natten/pytorch/cuda/na2d.cuh | 80 +-
csrc/include/natten/pytorch/cuda/na3d.cuh | 97 +-
csrc/include/natten/pytorch/helpers.h | 149 +-
csrc/include/natten/pytorch/na1d.h | 35 +-
csrc/include/natten/pytorch/na2d.h | 35 +-
csrc/include/natten/pytorch/na3d.h | 43 +-
csrc/natten.cpp | 9 +
csrc/src/pytorch/cpu/na1d.cpp | 91 +-
csrc/src/pytorch/cpu/na2d.cpp | 100 +-
csrc/src/pytorch/cpu/na3d.cpp | 117 +-
csrc/src/pytorch/cuda/na1d.cu | 111 +-
csrc/src/pytorch/cuda/na2d.cu | 121 +-
csrc/src/pytorch/cuda/na3d.cu | 139 +-
csrc/src/pytorch/na1d.cpp | 139 +-
csrc/src/pytorch/na2d.cpp | 153 +-
csrc/src/pytorch/na3d.cpp | 187 +-
docs/README.md | 20 +
docs/api.md | 95 +
docs/assets/batched_gemm_na_dark.png | Bin 0 -> 253611 bytes
docs/assets/batched_gemm_na_light.png | Bin 0 -> 252970 bytes
.../dilated_neighborhood_attn_2d_vis_dark.png | Bin 0 -> 2229500 bytes
...dilated_neighborhood_attn_2d_vis_light.png | Bin 0 -> 2228650 bytes
docs/assets/fna_dark.png | Bin 0 -> 133179 bytes
docs/assets/fna_light.png | Bin 0 -> 128466 bytes
docs/assets/neighborhood_attn_2d_vis_dark.png | Bin 0 -> 2225546 bytes
.../assets/neighborhood_attn_2d_vis_light.png | Bin 0 -> 2224460 bytes
docs/assets/old_natten_dark.png | Bin 0 -> 23076 bytes
docs/assets/old_natten_light.png | Bin 0 -> 23071 bytes
docs/backend.md | 45 +
docs/build.md | 19 +
docs/frontend.md | 349 +
docs/history.md | 50 +
docs/install.md | 140 +
docs/methodology/README.md | 13 +
docs/methodology/bmm.md | 21 +
docs/methodology/fused.md | 19 +
docs/tests.md | 30 +
scripts/autogen_cpu_naive.py | 93 +-
scripts/autogen_cuda_fna.py | 689 +
scripts/autogen_cuda_gemm_1d.py | 158 +-
scripts/autogen_cuda_gemm_2d.py | 2 +-
scripts/autogen_cuda_naive.py | 453 +-
src/natten/__init__.py | 14 +-
src/natten/autotuner.py | 497 +
src/natten/flops.py | 82 +
src/natten/fna.py | 47 +
src/natten/functional.py | 880 +-
src/natten/natten1d.py | 118 +-
src/natten/natten2d.py | 120 +-
src/natten/natten3d.py | 148 +-
src/natten/nested.py | 129 +-
src/natten/utils/__init__.py | 14 +
src/natten/utils/checks.py | 129 +
src/natten/utils/testing.py | 16 +-
tests/test_fna1d.py | 356 +
tests/test_fna2d.py | 379 +
tests/test_fna3d.py | 400 +
tests/test_na1d.py | 421 +-
tests/test_na2d.py | 593 +-
tests/test_na3d.py | 527 +-
tools/profile_1d.py | 44 +-
tools/profile_2d.py | 44 +-
..._2d_with_extra_tokens.py => profile_3d.py} | 81 +-
tools/requirements.txt | 1 +
tools/utils/__init__.py | 13 +-
tools/utils/{utils.py => formatting.py} | 360 +-
tools/utils/mappings.py | 188 +
tools/utils/na_profiler.py | 350 +
tools/utils/ops.py | 57 +
tools/utils/problem.py | 172 +
tools/utils/profiler_1d.py | 180 -
tools/utils/profiler_2d.py | 296 -
254 files changed, 133543 insertions(+), 55740 deletions(-)
delete mode 100644 assets/cudamemory_dark.png
delete mode 100644 assets/cudamemory_light.png
delete mode 100644 assets/cudatime_dark.png
delete mode 100644 assets/cudatime_light.png
delete mode 100644 assets/gemm_vs_naive.png
create mode 100644 csrc/autogen/include/natten_autogen/cuda/fna/dispatch_cm.h
create mode 100644 csrc/autogen/include/natten_autogen/cuda/fna/dispatch_device.h
create mode 100644 csrc/autogen/include/natten_autogen/cuda/fna/dispatch_dtype.h
create mode 100644 csrc/autogen/include/natten_autogen/cuda/fna/interface.h
create mode 100644 csrc/autogen/include/natten_autogen/cuda/fna/kernels.h
create mode 100644 csrc/autogen/include/natten_autogen/cuda/naive/dispatch_cm.h
delete mode 100644 csrc/autogen/include/natten_autogen/cuda/naive/dispatch_di.h
delete mode 100644 csrc/autogen/include/natten_autogen/cuda/naive/dispatch_ks.h
create mode 100644 csrc/autogen/src/cuda/fna/source_0.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_1.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_10.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_11.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_12.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_13.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_14.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_15.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_2.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_3.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_4.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_5.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_6.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_7.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_8.cu
create mode 100644 csrc/autogen/src/cuda/fna/source_9.cu
create mode 100644 csrc/include/natten/cuda/fna/epilogue/epilogue_pipelined.h
create mode 100644 csrc/include/natten/cuda/fna/epilogue/epilogue_rescale_output.h
create mode 100644 csrc/include/natten/cuda/fna/epilogue/epilogue_thread_apply_logsumexp.h
create mode 100644 csrc/include/natten/cuda/fna/epilogue/predicated_tile_iterator.h
create mode 100644 csrc/include/natten/cuda/fna/epilogue/predicated_tile_iterator_params.h
create mode 100644 csrc/include/natten/cuda/fna/fna_forward.cuh
create mode 100644 csrc/include/natten/cuda/fna/gemm/custom_mma.h
create mode 100644 csrc/include/natten/cuda/fna/gemm/custom_mma_base.h
create mode 100644 csrc/include/natten/cuda/fna/gemm/custom_mma_multistage.h
create mode 100644 csrc/include/natten/cuda/fna/gemm/custom_mma_pipelined.h
create mode 100644 csrc/include/natten/cuda/fna/gemm/find_default_mma.h
create mode 100644 csrc/include/natten/cuda/fna/gemm/mma_accum_lambda_iterator.h
create mode 100644 csrc/include/natten/cuda/fna/gemm/mma_from_smem.h
create mode 100644 csrc/include/natten/cuda/fna/gemm/replace_mma_iterators.h
create mode 100644 csrc/include/natten/cuda/fna/gemm_kernel_utils.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/default_warp_iterator_from_smem.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/epilogue_predicated_tile_iterator.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/make_residual_last.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/predicated_tile_access_iterator.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/predicated_tile_access_iterator_residual_last.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/predicated_tile_iterator.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/predicated_tile_iterator_residual_last.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/transpose_warp_iterator.h
create mode 100644 csrc/include/natten/cuda/fna/iterators/warp_iterator_from_smem.h
create mode 100644 csrc/include/natten/cuda/fna/kernel_forward.h
create mode 100644 csrc/include/natten/cuda/fna/na_utils.cuh
create mode 100644 csrc/include/natten/cuda/fna/transform/tile_smem_loader.h
delete mode 100644 csrc/include/natten/cuda/gemm/threadblock/default_epilogue_simt.cuh
delete mode 100644 csrc/include/natten/cuda/gemm/threadblock/default_epilogue_tensor_op.cuh
create mode 100644 csrc/include/natten/cuda/utils/cuda.h
rename csrc/include/natten/cuda/{gemm/utils.cuh => utils/cutlass.h} (97%)
create mode 100644 csrc/include/natten/natten.h
create mode 100644 docs/README.md
create mode 100644 docs/api.md
create mode 100644 docs/assets/batched_gemm_na_dark.png
create mode 100644 docs/assets/batched_gemm_na_light.png
create mode 100644 docs/assets/dilated_neighborhood_attn_2d_vis_dark.png
create mode 100644 docs/assets/dilated_neighborhood_attn_2d_vis_light.png
create mode 100644 docs/assets/fna_dark.png
create mode 100644 docs/assets/fna_light.png
create mode 100644 docs/assets/neighborhood_attn_2d_vis_dark.png
create mode 100644 docs/assets/neighborhood_attn_2d_vis_light.png
create mode 100644 docs/assets/old_natten_dark.png
create mode 100644 docs/assets/old_natten_light.png
create mode 100644 docs/backend.md
create mode 100644 docs/build.md
create mode 100644 docs/frontend.md
create mode 100644 docs/history.md
create mode 100644 docs/install.md
create mode 100644 docs/methodology/README.md
create mode 100644 docs/methodology/bmm.md
create mode 100644 docs/methodology/fused.md
create mode 100644 docs/tests.md
create mode 100644 scripts/autogen_cuda_fna.py
create mode 100644 src/natten/autotuner.py
create mode 100644 src/natten/fna.py
create mode 100644 src/natten/utils/checks.py
create mode 100644 tests/test_fna1d.py
create mode 100644 tests/test_fna2d.py
create mode 100644 tests/test_fna3d.py
rename tools/{profile_2d_with_extra_tokens.py => profile_3d.py} (64%)
rename tools/utils/{utils.py => formatting.py} (55%)
create mode 100644 tools/utils/mappings.py
create mode 100644 tools/utils/na_profiler.py
create mode 100644 tools/utils/ops.py
create mode 100644 tools/utils/problem.py
delete mode 100644 tools/utils/profiler_1d.py
delete mode 100644 tools/utils/profiler_2d.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 05b1b96..4128d7c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,24 @@
# Changelog
+## [Main branch]
+* Fused neighborhood attention (FNA) kernels (forward pass only for now)
+ * 1D, 2D and 3D Neighborhood Attention are supported,
+ * Causal neighborhood attention is implemented,
+ * Window (kernel) size, dilation, and causality can be defined *per-axis*,
+ * All GPU architectures since Maxwell (SM50) are supported,
+ * SM50 up to SM70 are SIMT-only, but support both FP16 and FP32,
+ * SM70 and SM75 target Tensor Cores in FP16, and SIMT-style in FP32,
+ * SM80 and above target Tensor Cores in FP16, BF16, and FP32.
+ * Relative positional biases are implemented (not defined for causal masking yet),
+  * Memory layout in FNA is different from existing kernels (`[B, *, heads, dim]` instead of `[B, heads, *, dim]`).
+  * Eventually this layout will allow skipping the permute/explicit reshape step in the attention module following
+    the QKV projection.
+* Naive kernels now implement and allow causal masking,
+* Naive kernels (CPU and CUDA) now allow varying parameters (window size, dilation, causal) across axes,
+* Major bug fix in Volta GEMM kernels
+ * The epilogue was different for Volta, and it slipped through unit tests,
+ * Tests are now more aggressive, and the issue has been fixed.
+
## [0.15.1] - 2024-01-24
* Attention tensors can now be views, which allows combining neighborhood and any other attention pattern (i.e. registers,
cross attention tokens, and the like) without extra copies. ([#85](https://github.com/SHI-Labs/NATTEN/pull/85) and [#87](https://github.com/SHI-Labs/NATTEN/pull/87)).
diff --git a/LICENSE b/LICENSE
index 7c099a3..22378b5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -19,3 +19,37 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+
+Fused Neighborhood Attention kernels are heavily based on the memory-efficient
+attention kernels from the xFormers project by Meta Platforms, Inc.
+
+Copyright (c) Facebook, Inc. and its affiliates
+
+BSD 3-Clause License
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
+ and IDIAP Research Institute nor the names of its contributors may be
+ used to endorse or promote products derived from this software without
+ specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
index a441400..b3a3e50 100644
--- a/Makefile
+++ b/Makefile
@@ -56,7 +56,7 @@ install:
NATTEN_CUDA_ARCH="${CUDA_ARCH}" NATTEN_N_WORKERS="${WORKERS}" NATTEN_VERBOSE="${VERBOSE}" pip install -v -e . 2>&1 | tee install.out
test:
- pytest -v -x ./tests
+ PYTORCH_NO_CUDA_MEMORY_CACHING=1 pytest -v -x ./tests
style:
ufmt format $(check_dirs)
diff --git a/README.md b/README.md
index b2ab7dd..ea08ee0 100644
--- a/README.md
+++ b/README.md
@@ -1,325 +1,117 @@
![NATTENLogo](assets/natten_dark.png#gh-dark-mode-only) ![NATTENLogo](assets/natten_light.png#gh-light-mode-only)
+| [Documentation](docs/)
*Neighborhood Attention Extension*
Bringing attention to a neighborhood near you!
+
+
+
+
+
NATTEN is an open-source project dedicated to providing fast implementations for
-[Neighborhood Attention](https://scholar.google.com/citations?view_op=view_citation&citation_for_view=Ndu0dUcAAAAJ:b0M2c_1WBrUC),
+[Neighborhood Attention](https://openaccess.thecvf.com/content/CVPR2023/html/Hassani_Neighborhood_Attention_Transformer_CVPR_2023_paper.html),
a sliding window self-attention mechanism.
-If you're not familiar with neighborhood attention, we recommend referring to
-[our papers](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer), or watching our
+If you're not familiar with neighborhood attention, please refer to
+[our papers](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer), or watch our
[YouTube video](https://www.youtube.com/watch?v=Ya4BfioxIHA) from CVPR 2023.
-NATTEN is primarily a C++/CUDA library, which has so far only supported binding with the torch API, and therefore is mostly
-usable through PyTorch. We plan to eliminate the torch dependency in the future and possibly support other frameworks /
-engines.
-
-NATTEN's python interface provides Neighborhood Attention (local attention)
-and Dilated Neighborhood Attention
-(sparse global attention, a.k.a. dilated local attention) as autograd-compatible PyTorch modules for both 1D, 2D, and 3D data.
-
-It also has experimental support for
-[forward mode automatic differentiation](https://pytorch.org/tutorials/intermediate/forward_ad_usage.html),
-and nested tensors.
-
-## CPU support
-Our CPU implementations are very limited and barely performance-optimized.
-While we aim to provide the best implementation for different devices, optimizing our CUDA kernels is higher up on the list of
-priorities. Contributions are always welcomed.
-
-
-## CUDA support
-NATTEN generally supports all architectures supported by PyTorch. More specifically, architectures since Kepler (SM35) are
-supported. However, our most-performant kernels only support architectures since Volta, targeting tensor core math.
-
-### Half-precision support
-Devices with compute capability greater than or equal to 6.0 (Pascal and later) allow running in FP16.
-
-Devices with compute capability greater than or equal to 8.0 (Ampere and later) allow running in BF16.
-
-### Naive kernels
-NATTEN provides more than one set of kernel. Our naive kernels, which were developed during the first phase of the project,
-provide a very basic implementation of neighborhood attention, and are the last resort for every problem. This means that if
-your device and software support alternatives to our naive kernels (i.e. GEMM kernels), and your problem size is supported,
-NATTEN will automatically pick the better kernels for you. (NOTE: this is done based on the class of kernels and not by actual
-performance via profiling.)
-
-Naive kernels are always usable across different architectures, though not the most performant.
-
-### Tiled kernels
-Naive kernels for the 2-dimensional neighborhood attention also come with a tiled implementation for one of the three
-underlying operations, which is considerably more performant than the original. However, the tiled kernels only support problem
-sizes with head dim 32, and up to kernel size 13x13.
-Tiled kernels are also not supported in devices with compute capability smaller than 6.0.
-
-### GEMM kernels.
-
-Our GEMM-based kernels depend on and are modeled after
-[CUTLASS](https://github.com/NVIDIA/cutlass/)'s [Implicit GEMM](https://github.com/NVIDIA/cutlass/blob/main/media/docs/implicit_gemm_convolution.md)
-kernels for convolution.
-
-Devices with compute capability greater than or equal to 7.0 (Volta, Turing, Ampere, Ada Lovelace, Hopper), can run our GEMM
-kernels, which are somewhat performance-optimized, thanks to the underlying mainloop from CUTLASS, and target Tensor Core math.
-
-However, do note that their current float16/bfloat16 implementations do not typically result in improved latency,
-due to a memory alignment issue, which we aim to resolve in future kernels.
-
-Devices with compute capability greater than or equal to 8.0 (Ampere and later) support GEMM kernels with double, full, and
-half precision (FP64, FP32, FP16, BF16).
-
-Devices with compute capability 7.0 or 7.5 (Volta and Turing) only support GEMM kernels with half precision (FP16). This is
-because their tensor cores only allow FP16 math.
-
-![GEMMvsNaive](assets/gemm_vs_naive.png)
-
-NOTE: the table presents the average improvement in latency over different problem sizes with full precision (tfloat32).
-
-### How do I check my compute capability / architecture?
-Simple, just Google your GPU model, and check its compute capability.
-If you've already set up NATTEN, you could also run:
-```python
-from natten.functional import get_device_cc
-
-cc = get_device_cc()
-cc = get_device_cc(0) # Optionally select a specific GPU
-
-print(f"Your device is SM{cc}.")
-```
-
-### How do I know if I'm using the new kernels?
-The new NATTEN library sets up constants that are binded to the python interface, which will allow you to
-check whether you've compiled with: a. CUDA, b. Float16 (half) support, c. Bfloat16 support, d. New GEMM kernels.
-
-```python
-import natten
-
-# Whether NATTEN was built with CUDA kernels,
-# and supports running them on this system.
-print(natten.has_cuda())
-
-# Whether NATTEN supports running float16 on
-# the selected device.
-print(natten.has_half())
-print(natten.has_half(0)) # Optionally specify a GPU index.
-
-# Whether NATTEN supports running bfloat16 on
-# the selected device.
-print(natten.has_bfloat())
-print(natten.has_bfloat(0)) # Optionally specify a GPU index.
-
-# Whether NATTEN supports running GEMM kernels
-# on the selected device.
-print(natten.has_gemm())
-print(natten.has_gemm(0)) # Optionally specify a GPU index.
-
-# Whether NATTEN supports running GEMM kernels
-# in full precision on the selected device.
-print(natten.has_fp32_gemm())
-print(natten.has_fp32_gemm(0)) # Optionally specify a GPU index.
-```
-
-If `natten.has_gemm()` returns true, by default NATTEN will call the faster GEMM kernels instead of the original naive kernels
-for both NA1D and NA2D. 3D Neighborhood attention is not supported at this time, but you can still use the naive kernels.
-
-In addition, we will be adding scripts that allow you to profile and observe latency from the kernels with those options
-available.
-
-## About NATTEN
-Sliding window self attention mechanisms have been relatively overlooked, in part due to implementation difficulties.
-For example, in a paper proposing one of the earliest examples of such methods,
-[SASA](https://proceedings.neurips.cc/paper/2019/file/3416a75f4cea9109507cacd8e2f2aefc-Paper.pdf),
-it was noted that
-although such methods are theoretically efficient, they're relatively slow in practice, compared to convolutions,
-which have been implemented in most well-known deep learning libraries.
-
-That is why we started developing NATTEN, an extension to existing libraries with efficient implementations of sliding window
-attention mechanisms, which will enable research in this direction including building powerful hierarchical vision
-transformers.
-
-For more information, we highly recommend reading our preprints [NAT](https://arxiv.org/abs/2204.07143) and
-[DiNAT](https://arxiv.org/abs/2209.15001), and check out their [repository](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer).
-
-## Requirements
-
-* python >= 3.8
-* torch >= 2.0
+## Getting started
-NATTEN supports PyTorch version 2.0 and later, and Python versions 3.7, 3.8, 3.9, 3.10(only torch >= 1.11), and 3.11 (only torch >= 1.13).
+NATTEN supports PyTorch version 2.0 and later, and Python versions 3.8 and above.
+Python 3.12 is only supported with torch >= 2.2.0.
Older NATTEN releases supported python >= 3.7 and torch >= 1.8.
-**NOTE:** NATTEN only comes with pre-built Linux wheels, and supports Kepler and above (`SM >= 35`).
-Make sure your GPU is supported by referring to
-[this webpage](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/).
-Future versions will extend support to older GPUs.
-
-## Getting started
-
-### Linux
-Just refer to our website, [shi-labs.com/natten](https://www.shi-labs.com/natten/), select your PyTorch version and the CUDA
-version it was compiled with, copy-paste the command and install in seconds!
-
-For example, if you're on `torch==2.0.0+cu118`, you should install NATTEN using the following wheel:
-```bash
-pip3 install natten -f https://shi-labs.com/natten/wheels/cu118/torch2.0.0/index.html
-```
-
-More generally:
-```bash
-pip3 install natten -f https://shi-labs.com/natten/wheels/{cu_version}/torch{torch_version}/index.html
-```
-
-**NOTE:** If you do not specify a wheel URL, pip will collect NATTEN and try to compile on locally, which depending
-on your system might take up to 30 minutes.
-We strongly recommend using our website if you're a Linux user.
-
-### Mac
-Unfortunately we are not yet able to build Mac wheels (and do not yet have a Metal backend). However, you can compile upon
-installing and use the CPU kernels:
-
-```bash
-pip3 install natten
-```
-
-### Windows
-The current release has not been successfully built Windows devices with CUDA, and therefore does not yet have Windows wheels.
-If you are a windows user willing to help us figure out building with MSVC, please contact us or open an issue.
-
-### Build from source
-Once you've set up your Python environment and installed PyTorch with CUDA, simply clone and build:
-
-```bash
-git clone https://github.com/SHI-Labs/NATTEN
-cd NATTEN
-
-pip install -r requirements.txt
-
-make
-```
-
-NOTE: NATTEN will use the PyTorch API to detect your GPU architecture, and will by default attempt to use 1/4th of the number
-of processes your system allows to build. You can override them by passing in the following arguments:
-```bash
-# Build with 2 workers/processes
-make WORKERS=2
-
-# Build targeting SM89 (Ada Lovelace)
-make CUDA_ARCH="8.9"
-```
-
-Please also note that building with the latest GEMM kernels can be a bit time consuming, which means at least 10 - 20 minutes
-given that you use enough workers. It is technically possible to improve build time by generating more source files and using
-more workers (at the expense of generating a larger binary), but that option will be made available in the future.
-
-#### Optional: run unit tests
-You can optionally run unit tests to verify building from source finished successfully:
-
-```bash
-make test
-```
-
-
-## Catalog
-- [x] Neighborhood Attention 1D (CPU, naive)
-- [x] Neighborhood Attention 2D (CPU, naive)
-- [x] Neighborhood Attention 3D (CPU, naive)
-- [x] Neighborhood Attention 1D (CUDA, naive)
-- [x] Neighborhood Attention 2D (CUDA, naive)
-- [x] Neighborhood Attention 3D (CUDA, naive)
-- [x] Neighborhood Attention 1D (CUDA, gemm-based, SM70 and above)
-- [x] Neighborhood Attention 2D (CUDA, gemm-based, SM70 and above)
-- [x] Dilation support
-- [x] Float16 support
-- [x] BFloat16 support
-- [x] Kepler and Maxwell (30<=SM<60) support
-- [ ] Windows builds
-- [ ] Neighborhood Attention 1D (CUDA, fused kernels)
-- [ ] Neighborhood Attention 2D (CUDA, fused kernels)
-
-## Usage
-Simply import `NeighborhoodAttention1D`, `NeighborhoodAttention2D`, or `NeighborhoodAttention3D` from `natten`:
-```python
-from natten import NeighborhoodAttention1D
-from natten import NeighborhoodAttention2D
-from natten import NeighborhoodAttention3D
-
-na1d = NeighborhoodAttention1D(dim=128, kernel_size=7, dilation=2, num_heads=4)
-na2d = NeighborhoodAttention2D(dim=128, kernel_size=7, dilation=2, num_heads=4)
-na3d = NeighborhoodAttention3D(dim=128, kernel_size=7, dilation=2, num_heads=4)
-```
-
-NA3D also supports different kernel size and dilation values for depth:
-```python
-na3d = NeighborhoodAttention3D(
- dim=128,
- kernel_size=7,
- kernel_size_d=5,
- dilation=2,
- dilation_d=3,
- num_heads=4)
-```
-
-Modules expect inputs of shape `[batch_size, *, dim]`:
-* NA1D: `[batch_size, sequence_length, dim]`
-* NA2D: `[batch_size, height, width, dim]`
-* NA3D: `[batch_size, depth, height, width, dim]`
-
-
-### FLOPs
-We recommend counting flops through [fvcore](https://github.com/facebookresearch/fvcore).
-
-```shell
-pip install fvcore
-```
-
-Once you have fvcore installed, you can directly use our dedicated FLOP counter:
-```python
-from natten.flops import get_flops
-
-flops = get_flops(model, input)
-```
-
-Alternatively, if you are using fvcore's `FlopCountAnalysis` directly, be sure to add our op handles:
-```python
-from fvcore.nn import FlopCountAnalysis
-from natten.flops import add_natten_handle
-
-# ...
-
-flop_ctr = FlopCountAnalysis(model, input)
-flop_ctr = add_natten_handle(flop_ctr)
-
-# ...
-```
+Please refer to the [install instructions](docs/install.md) to find out whether your operating system and hardware
+accelerator are compatible with NATTEN.
+
+## Feature availability
+
+| Problem space | CPU backend | CUDA backend |
+| ----------- | ----------- | ---------------- |
+| 1D | naive | naive, gemm, fna |
+| 2D | naive | naive, gemm, fna |
+| 3D | naive | naive, fna |
+
+### CPU
+
+| Problem space | CPU Backend | Causal masking | Varying parameters | Relative positional bias | Autograd support |
+| ----------- | ----------- | ------------------ | ------------------ | ------------------------ | ------------------------ |
+| 1D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode |
+| 2D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode |
+| 3D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode |
+
+Notes:
+* Forward mode autograd does not support relative positional biases and causal masking yet.
+* Relative positional biases are not yet supported when any axis has causal masking enabled.
+
+### CUDA
+
+| Problem space | CUDA Backend | Causal masking | Varying parameters | Relative positional bias | Autograd support | Min. Arch |
+| ----------- | ----------- | ------------------ | ------------------ | ------------------------ | ------------------------ | --------- |
+| 1D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode | SM35 |
+| 2D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode | SM35 |
+| 3D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode | SM35 |
+| 1D | gemm | | | :white_check_mark: | Forward and reverse mode | SM70 |
+| 2D | gemm | | | :white_check_mark: | Forward and reverse mode | SM70 |
+| 1D | fna | :white_check_mark: | :white_check_mark: | :white_check_mark: | Coming soon | SM50 |
+| 2D | fna | :white_check_mark: | :white_check_mark: | :white_check_mark: | Coming soon | SM50 |
+| 3D | fna | :white_check_mark: | :white_check_mark: | :white_check_mark: | Coming soon | SM50 |
+
+Notes:
+* FP16 kernels are only available on SM50 and above, and BF16 requires SM80 and above.
+* The GEMM backend on SM70 and SM75 only supports FP16.
+* Tiled kernels only implement one of the three underlying ops, are only available for 2D problems, and require head dim = 32.
+* Forward mode autograd does not support relative positional biases and causal masking yet.
+* Relative positional biases are not yet supported when any axis has causal masking enabled.
+* Naive backend allows FP16 for SM50 and above only. FP32/FP64 are available for SM35 and above.
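+
+For reference, you can compare the minimum architecture column above against your device's compute capability using
+`natten.functional.get_device_cc` (a minimal sketch for inspection only; NATTEN picks backends automatically):
+
+```python
+from natten.functional import get_device_cc
+
+cc = get_device_cc()   # or get_device_cc(0) for a specific GPU
+
+print(f"Your device is SM{cc}.")
+print("naive:", "available" if cc >= 35 else "unavailable")  # SM35 and above
+print("gemm: ", "available" if cc >= 70 else "unavailable")  # SM70 and above
+print("fna:  ", "available" if cc >= 50 else "unavailable")  # SM50 and above
+```
+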
## License
NATTEN is released under the [MIT License](LICENSE).
## Citation
```bibtex
+@misc{hassani2024faster,
+ title = {Faster Neighborhood Attention: Reducing the O(n^2) Cost of Self Attention at the Threadblock Level},
+ author = {Ali Hassani and Wen-Mei Hwu and Humphrey Shi},
+ year = 2024,
+ url = {https://arxiv.org/abs/2403.04690},
+ eprint = {2403.04690},
+ archiveprefix = {arXiv},
+ primaryclass = {cs.CV}
+}
@inproceedings{hassani2023neighborhood,
- title = {Neighborhood Attention Transformer},
- author = {Ali Hassani and Steven Walton and Jiachen Li and Shen Li and Humphrey Shi},
- year = 2023,
- booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}
+ title = {Neighborhood Attention Transformer},
+ author = {Ali Hassani and Steven Walton and Jiachen Li and Shen Li and Humphrey Shi},
+ year = 2023,
+ booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}
}
-@article{hassani2022dilated,
- title = {Dilated Neighborhood Attention Transformer},
- author = {Ali Hassani and Humphrey Shi},
- year = 2022,
- url = {https://arxiv.org/abs/2209.15001},
- eprint = {2209.15001},
- archiveprefix = {arXiv},
- primaryclass = {cs.CV}
+@misc{hassani2022dilated,
+ title = {Dilated Neighborhood Attention Transformer},
+ author = {Ali Hassani and Humphrey Shi},
+ year = 2022,
+ url = {https://arxiv.org/abs/2209.15001},
+ eprint = {2209.15001},
+ archiveprefix = {arXiv},
+ primaryclass = {cs.CV}
}
```
## Acknowledgements
-We would like to thank NVIDIA, and the [CUTLASS project](https://github.com/NVIDIA/cutlass/) and team for their efforts in
+We thank NVIDIA, the [CUTLASS project](https://github.com/NVIDIA/cutlass/), and its team for their efforts in
creating and open-sourcing CUTLASS. We would also like to thank Haicheng Wu for his valuable feedback and comments which led to
-the creation of Implicit GEMM NA.
-We also thank Meta, and the [PyTorch](https://github.com/pytorch/pytorch/) project and team.
+the creation of GEMM-based NA.
+We also thank Meta and the [xFormers](https://github.com/facebookresearch/xformers/) team
+for their FMHA kernel, which is what our Fused Neighborhood Attention kernel is based on.
+We thank the [PyTorch](https://github.com/pytorch/pytorch/) project and team.
diff --git a/assets/README_pypi.md b/assets/README_pypi.md
index 1182a1a..5cf06f1 100644
--- a/assets/README_pypi.md
+++ b/assets/README_pypi.md
@@ -10,307 +10,87 @@ NATTEN is an open-source project dedicated to providing fast implementations for
[Neighborhood Attention](https://scholar.google.com/citations?view_op=view_citation&citation_for_view=Ndu0dUcAAAAJ:b0M2c_1WBrUC),
a sliding window self-attention mechanism.
-If you're not familiar with neighborhood attention, we recommend referring to
-[our papers](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer), or watching our
+If you're not familiar with neighborhood attention, please refer to
+[our papers](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer), or watch our
[YouTube video](https://www.youtube.com/watch?v=Ya4BfioxIHA) from CVPR 2023.
-NATTEN is primarily a C++/CUDA library, which has so far only supported binding with the torch API, and therefore is mostly
-usable through PyTorch. We plan to eliminate the torch dependency in the future and possibly support other frameworks /
-engines.
-
-NATTEN's python interface provides Neighborhood Attention (local attention)
-and Dilated Neighborhood Attention
-(sparse global attention, a.k.a. dilated local attention) as autograd-compatible PyTorch modules for both 1D, 2D, and 3D data.
-
-It also has experimental support for
-[forward mode automatic differentiation](https://pytorch.org/tutorials/intermediate/forward_ad_usage.html),
-and nested tensors.
-
-## CPU support
-Our CPU implementations are very limited and barely performance-optimized.
-While we aim to provide the best implementation for different devices, optimizing our CUDA kernels is higher up on the list of
-priorities. Contributions are always welcomed.
-
-
-## CUDA support
-NATTEN generally supports all architectures supported by PyTorch. More specifically, architectures since Kepler (SM35) are
-supported. However, our most-performant kernels only support architectures since Volta, targeting tensor core math.
-
-### Half-precision support
-Devices with compute capability greater than or equal to 6.0 (Pascal and later) allow running in FP16.
-
-Devices with compute capability greater than or equal to 8.0 (Ampere and later) allow running in BF16.
-
-### Naive kernels
-NATTEN provides more than one set of kernel. Our naive kernels, which were developed during the first phase of the project,
-provide a very basic implementation of neighborhood attention, and are the last resort for every problem. This means that if
-your device and software support alternatives to our naive kernels (i.e. GEMM kernels), and your problem size is supported,
-NATTEN will automatically pick the better kernels for you. (NOTE: this is done based on the class of kernels and not by actual
-performance via profiling.)
-
-Naive kernels are always usable across different architectures, though not the most performant.
-
-### Tiled kernels
-Naive kernels for the 2-dimensional neighborhood attention also come with a tiled implementation for one of the three
-underlying operations, which is considerably more performant than the original. However, the tiled kernels only support problem
-sizes with head dim 32, and up to kernel size 13x13.
-Tiled kernels are also not supported in devices with compute capability smaller than 6.0.
-
-### GEMM kernels.
-
-Our GEMM-based kernels depend on and are modeled after
-[CUTLASS](https://github.com/NVIDIA/cutlass/)'s [Implicit GEMM](https://github.com/NVIDIA/cutlass/blob/main/media/docs/implicit_gemm_convolution.md)
-kernels for convolution.
-
-Devices with compute capability greater than or equal to 7.0 (Volta, Turing, Ampere, Ada Lovelace, Hopper), can run our GEMM
-kernels, which are somewhat performance-optimized, thanks to the underlying mainloop from CUTLASS, and target Tensor Core math.
-
-However, do note that their current float16/bfloat16 implementations do not typically result in improved latency,
-due to a memory alignment issue, which we aim to resolve in future kernels.
-
-Devices with compute capability greater than or equal to 8.0 (Ampere and later) support GEMM kernels with double, full, and
-half precision (FP64, FP32, FP16, BF16).
-
-Devices with compute capability 7.0 or 7.5 (Volta and Turing) only support GEMM kernels with half precision (FP16). This is
-because their tensor cores only allow FP16 math.
-
-
-### How do I check my compute capability / architecture?
-Simple, just Google your GPU model, and check its compute capability.
-If you've already set up NATTEN, you could also run:
-```python
-from natten.functional import get_device_cc
-
-cc = get_device_cc()
-cc = get_device_cc(0) # Optionally select a specific GPU
-
-print(f"Your device is SM{cc}.")
-```
-
-### How do I know if I'm using the new kernels?
-The new NATTEN library sets up constants that are binded to the python interface, which will allow you to
-check whether you've compiled with: a. CUDA, b. Float16 (half) support, c. Bfloat16 support, d. New GEMM kernels.
-
-```python
-import natten
-
-# Whether NATTEN was built with CUDA kernels,
-# and supports running them on this system.
-print(natten.has_cuda())
-
-# Whether NATTEN supports running float16 on
-# the selected device.
-print(natten.has_half())
-print(natten.has_half(0)) # Optionally specify a GPU index.
-
-# Whether NATTEN supports running bfloat16 on
-# the selected device.
-print(natten.has_bfloat())
-print(natten.has_bfloat(0)) # Optionally specify a GPU index.
-
-# Whether NATTEN supports running GEMM kernels
-# on the selected device.
-print(natten.has_gemm())
-print(natten.has_gemm(0)) # Optionally specify a GPU index.
-
-# Whether NATTEN supports running GEMM kernels
-# in full precision on the selected device.
-print(natten.has_fp32_gemm())
-print(natten.has_fp32_gemm(0)) # Optionally specify a GPU index.
-```
-
-If `natten.has_gemm()` returns true, by default NATTEN will call the faster GEMM kernels instead of the original naive kernels
-for both NA1D and NA2D. 3D Neighborhood attention is not supported at this time, but you can still use the naive kernels.
-
-In addition, we will be adding scripts that allow you to profile and observe latency from the kernels with those options
-available.
-
-## About NATTEN
-Sliding window self attention mechanisms have been relatively overlooked, in part due to implementation difficulties.
-For example, in a paper proposing one of the earliest examples of such methods,
-[SASA](https://proceedings.neurips.cc/paper/2019/file/3416a75f4cea9109507cacd8e2f2aefc-Paper.pdf),
-it was noted that
-although such methods are theoretically efficient, they're relatively slow in practice, compared to convolutions,
-which have been implemented in most well-known deep learning libraries.
-
-That is why we started developing NATTEN, an extension to existing libraries with efficient implementations of sliding window
-attention mechanisms, which will enable research in this direction including building powerful hierarchical vision
-transformers.
-
-For more information, we highly recommend reading our preprints [NAT](https://arxiv.org/abs/2204.07143) and
-[DiNAT](https://arxiv.org/abs/2209.15001), and check out their [repository](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer).
-
-## Requirements
-
-* python >= 3.8
-* torch >= 2.0
+## Getting started
-NATTEN supports PyTorch version 2.0 and later, and Python versions 3.7, 3.8, 3.9, 3.10(only torch >= 1.11), and 3.11 (only torch >= 1.13).
+NATTEN supports PyTorch version 2.0 and later, and Python versions 3.8 and above.
+Python 3.12 is only supported with torch >= 2.2.0.
Older NATTEN releases supported python >= 3.7 and torch >= 1.8.
-**NOTE:** NATTEN only comes with pre-built Linux wheels, and supports Kepler and above (`SM >= 35`).
-Make sure your GPU is supported by referring to
-[this webpage](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/).
-Future versions will extend support to older GPUs.
-
-## Getting started
-
-### Linux
-Just refer to our website, [shi-labs.com/natten](https://www.shi-labs.com/natten/), select your PyTorch version and the CUDA
-version it was compiled with, copy-paste the command and install in seconds!
-
-For example, if you're on `torch==2.0.0+cu118`, you should install NATTEN using the following wheel:
-```bash
-pip3 install natten -f https://shi-labs.com/natten/wheels/cu118/torch2.0.0/index.html
-```
-
-More generally:
-```bash
-pip3 install natten -f https://shi-labs.com/natten/wheels/{cu_version}/torch{torch_version}/index.html
-```
-
-**NOTE:** If you do not specify a wheel URL, pip will collect NATTEN and try to compile on locally, which depending
-on your system might take up to 30 minutes.
-We strongly recommend using our website if you're a Linux user.
-
-### Mac
-Unfortunately we are not yet able to build Mac wheels (and do not yet have a Metal backend). However, you can compile upon
-installing and use the CPU kernels:
-
-```bash
-pip3 install natten
-```
-
-### Windows
-The current release has not been successfully built Windows devices with CUDA, and therefore does not yet have Windows wheels.
-If you are a windows user willing to help us figure out building with MSVC, please contact us or open an issue.
-
-### Build from source
-Once you've set up your Python environment and installed PyTorch with CUDA, simply clone and build:
-
-```bash
-git clone https://github.com/SHI-Labs/NATTEN
-cd NATTEN
-
-pip install -r requirements.txt
-
-make
-```
-
-NOTE: NATTEN will use the PyTorch API to detect your GPU architecture, and will by default attempt to use 1/4th of the number
-of processes your system allows to build. You can override them by passing in the following arguments:
-```bash
-# Build with 2 workers/processes
-make WORKERS=2
-
-# Build targeting SM89 (Ada Lovelace)
-make CUDA_ARCH="8.9"
-```
-
-Please also note that building with the latest GEMM kernels can be a bit time consuming, which means at least 10 - 20 minutes
-given that you use enough workers. It is technically possible to improve build time by generating more source files and using
-more workers (at the expense of generating a larger binary), but that option will be made available in the future.
-
-#### Optional: run unit tests
-You can optionally run unit tests to verify building from source finished successfully:
-
-```bash
-make test
-```
-
-
-## Catalog
-- [x] Neighborhood Attention 1D (CPU, naive)
-- [x] Neighborhood Attention 2D (CPU, naive)
-- [x] Neighborhood Attention 3D (CPU, naive)
-- [x] Neighborhood Attention 1D (CUDA, naive)
-- [x] Neighborhood Attention 2D (CUDA, naive)
-- [x] Neighborhood Attention 3D (CUDA, naive)
-- [x] Neighborhood Attention 1D (CUDA, gemm-based, SM70 and above)
-- [x] Neighborhood Attention 2D (CUDA, gemm-based, SM70 and above)
-- [x] Dilation support
-- [x] Float16 support
-- [x] BFloat16 support
-- [x] Kepler and Maxwell (30<=SM<60) support
-- [ ] Windows builds
-- [ ] Neighborhood Attention 1D (CUDA, fused kernels)
-- [ ] Neighborhood Attention 2D (CUDA, fused kernels)
-
-## Usage
-Simply import `NeighborhoodAttention1D`, `NeighborhoodAttention2D`, or `NeighborhoodAttention3D` from `natten`:
-```python
-from natten import NeighborhoodAttention1D
-from natten import NeighborhoodAttention2D
-from natten import NeighborhoodAttention3D
-
-na1d = NeighborhoodAttention1D(dim=128, kernel_size=7, dilation=2, num_heads=4)
-na2d = NeighborhoodAttention2D(dim=128, kernel_size=7, dilation=2, num_heads=4)
-na3d = NeighborhoodAttention3D(dim=128, kernel_size=7, dilation=2, num_heads=4)
-```
-
-NA3D also supports different kernel size and dilation values for depth:
-```python
-na3d = NeighborhoodAttention3D(
- dim=128,
- kernel_size=7,
- kernel_size_d=5,
- dilation=2,
- dilation_d=3,
- num_heads=4)
-```
-
-Modules expect inputs of shape `[batch_size, *, dim]`:
-* NA1D: `[batch_size, sequence_length, dim]`
-* NA2D: `[batch_size, height, width, dim]`
-* NA3D: `[batch_size, depth, height, width, dim]`
-
-
-### FLOPs
-We recommend counting flops through [fvcore](https://github.com/facebookresearch/fvcore).
-
-```shell
-pip install fvcore
-```
-
-Once you have fvcore installed, you can directly use our dedicated FLOP counter:
-```python
-from natten.flops import get_flops
-
-flops = get_flops(model, input)
-```
-
-Alternatively, if you are using fvcore's `FlopCountAnalysis` directly, be sure to add our op handles:
-```python
-from fvcore.nn import FlopCountAnalysis
-from natten.flops import add_natten_handle
-
-# ...
-
-flop_ctr = FlopCountAnalysis(model, input)
-flop_ctr = add_natten_handle(flop_ctr)
-
-# ...
-```
+Please refer to the [install instructions](docs/install.md) to find out whether your operating system and hardware
+accelerator are compatible with NATTEN.
+
+## Feature availability
+
+| Problem space | CPU backend | CUDA backend |
+| ----------- | ----------- | ---------------- |
+| 1D | naive | naive, gemm, fna |
+| 2D | naive | naive, gemm, fna |
+| 3D | naive | naive, fna |
+
+### CPU
+
+| Problem space | CPU Backend | Causal masking | Varying parameters | Relative positional bias | Autograd support |
+| ----------- | ----------- | ------------------ | ------------------ | ------------------------ | ------------------------ |
+| 1D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode |
+| 2D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode |
+| 3D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode |
+
+Notes:
+* Forward mode autograd does not support relative positional biases and causal masking yet.
+* Relative positional biases are not yet supported when any axis has causal masking enabled.
+
+### CUDA
+
+| Problem space | CUDA Backend | Causal masking | Varying parameters | Relative positional bias | Autograd support | Min. Arch |
+| ----------- | ----------- | ------------------ | ------------------ | ------------------------ | ------------------------ | --------- |
+| 1D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode | SM35 |
+| 2D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode | SM35 |
+| 3D | naive | :white_check_mark: | :white_check_mark: | :white_check_mark: | Forward and reverse mode | SM35 |
+| 1D | gemm | | | :white_check_mark: | Forward and reverse mode | SM70 |
+| 2D | gemm | | | :white_check_mark: | Forward and reverse mode | SM70 |
+| 1D | fna | :white_check_mark: | :white_check_mark: | :white_check_mark: | Coming soon | SM50 |
+| 2D | fna | :white_check_mark: | :white_check_mark: | :white_check_mark: | Coming soon | SM50 |
+| 3D | fna | :white_check_mark: | :white_check_mark: | :white_check_mark: | Coming soon | SM50 |
+
+Notes:
+* FP16 kernels are only available on SM50 and above, and BF16 requires SM80 and above.
+* The GEMM backend on SM70 and SM75 only supports FP16.
+* Tiled kernels only implement one of the three underlying ops, are only available for 2D problems, and require head dim = 32.
+* Forward mode autograd does not support relative positional biases and causal masking yet.
+* Relative positional biases are not yet supported when any axis has causal masking enabled.
+* Naive backend allows FP16 for SM50 and above only. FP32/FP64 are available for SM35 and above.
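+
+As a quick way to check the last column against your hardware, you can compare your GPU's compute capability
+(SM version) with the minimum architecture each backend requires. The helper below is only an illustration built
+on plain PyTorch; it is not a NATTEN API, and the table above remains the reference.
+
+```python
+import torch
+
+# Minimum SM versions per CUDA backend, taken from the table above.
+MIN_SM = {"naive": 35, "gemm": 70, "fna": 50}
+
+def meets_min_arch(backend: str) -> bool:
+    # Compute capability as (major, minor), e.g. (8, 0) for SM80.
+    if not torch.cuda.is_available():
+        return False
+    major, minor = torch.cuda.get_device_capability()
+    return major * 10 + minor >= MIN_SM[backend]
+
+print({backend: meets_min_arch(backend) for backend in MIN_SM})
+```
+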
## License
-NATTEN is released under the [MIT License](https://github.com/SHI-Labs/NATTEN/blob/main/LICENSE).
+NATTEN is released under the [MIT License](LICENSE).
## Citation
```bibtex
@inproceedings{hassani2023neighborhood,
- title = {Neighborhood Attention Transformer},
- author = {Ali Hassani and Steven Walton and Jiachen Li and Shen Li and Humphrey Shi},
- year = 2023,
- booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}
+ title = {Neighborhood Attention Transformer},
+ author = {Ali Hassani and Steven Walton and Jiachen Li and Shen Li and Humphrey Shi},
+ year = 2023,
+ booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}
}
@article{hassani2022dilated,
- title = {Dilated Neighborhood Attention Transformer},
- author = {Ali Hassani and Humphrey Shi},
- year = 2022,
- url = {https://arxiv.org/abs/2209.15001},
- eprint = {2209.15001},
- archiveprefix = {arXiv},
- primaryclass = {cs.CV}
+ title = {Dilated Neighborhood Attention Transformer},
+ author = {Ali Hassani and Humphrey Shi},
+ year = 2022,
+ url = {https://arxiv.org/abs/2209.15001},
+ eprint = {2209.15001},
+ archiveprefix = {arXiv},
+ primaryclass = {cs.CV}
}
```
+
+## Acknowledgements
+We thank NVIDIA and the [CUTLASS project](https://github.com/NVIDIA/cutlass/) and its team for their efforts in
+creating and open-sourcing CUTLASS. We would also like to thank Haicheng Wu for his valuable feedback and comments,
+which led to the creation of GEMM-based NA.
+We also thank Meta and the [xFormers](https://github.com/facebookresearch/xformers/) team
+for their FMHA kernel, on which our Fused Neighborhood Attention kernel is based.
+We thank the [PyTorch](https://github.com/pytorch/pytorch/) project and team.
diff --git a/assets/cudamemory_dark.png b/assets/cudamemory_dark.png
deleted file mode 100644
index 16337355b3606b3ad617c5c221c45b10ebd125cf..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 172479