diff --git a/.appveyor.yml b/.appveyor.yml
deleted file mode 100644
index 743eee76..00000000
--- a/.appveyor.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-# Documentation: http://www.appveyor.com/docs/appveyor-yml/
-image: Visual Studio 2017
-
-cache: c:\tools\vcpkg\installed\
-
-platform: x64
-
-environment:
-  matrix:
-  - julia_version: 1.0
-  - julia_version: 1.1
-  - julia_version: 1.2
-  - julia_version: 1.3
-  - julia_version: nightly
-
-# # Uncomment the following lines to allow failures on nightly julia
-# # (tests will run but not make your overall status red)
-matrix:
-  allow_failures:
-    - julia_version: nightly
-
-branches:
-  only:
-    - master
-    - /release-.*/
-
-notifications:
-  - provider: Email
-    on_build_success: false
-    on_build_failure: false
-    on_build_status_changed: false
-
-install:
-  - ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1"))
-  - set PATH=%PATH%;c:\tools\vcpkg
-  - set PATH=%PATH%;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin\
-  - vcpkg install openblas:x64-windows fftw3[core,threads]:x64-windows mpfr:x64-windows mpir:x64-windows --recurse
-  - set PATH=c:\tools\vcpkg\installed\x64-windows\bin;%PATH%
-
-build_script:
-  - echo "%JL_BUILD_SCRIPT%"
-  - C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%"
-
-test_script:
-  - echo "%JL_TEST_SCRIPT%"
-  - C:\julia\bin\julia -e "%JL_TEST_SCRIPT%"
-
-# # Uncomment to support code coverage upload. Should only be enabled for packages
-# # which would have coverage gaps without running on Windows
-# on_success:
-#   - echo "%JL_CODECOV_SCRIPT%"
-#   - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%"
diff --git a/.github/workflows/CIWindows.yml b/.github/workflows/CIWindows.yml
new file mode 100644
index 00000000..b0657745
--- /dev/null
+++ b/.github/workflows/CIWindows.yml
@@ -0,0 +1,40 @@
+name: CI Windows  # tests the package on Windows with 32- and 64-bit Julia
+on:
+  - push
+  - pull_request
+jobs:
+  testwindows:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        version:
+          - '1'  # quoted so YAML keeps it a string rather than an integer
+        os:
+          - windows-latest
+        arch:
+          - x86
+          - x64
+    steps:
+      - uses: actions/checkout@v4  # same major version as ci.yml
+      - uses: julia-actions/setup-julia@v2  # same major version as ci.yml
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+          show-versioninfo: true
+      - uses: actions/cache@v3  # reuse downloaded Julia artifacts between runs
+        env:
+          cache-name: cache-artifacts
+        with:
+          path: ~/.julia/artifacts
+          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
+          restore-keys: |
+            ${{ runner.os }}-test-${{ env.cache-name }}-
+            ${{ runner.os }}-test-
+            ${{ runner.os }}-
+      - uses: julia-actions/julia-buildpkg@v1  # versioned tag instead of the floating "latest" ref
+      - uses: julia-actions/julia-runtest@v1  # versioned tag instead of the floating "latest" ref
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v3
+        with:
+          file: lcov.info
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index 179e7a99..09181610 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -1,25 +1,45 @@
 name: CompatHelper
-
 on:
   schedule:
-    - cron: '00 * * * *'
-
+    - cron: 0 0 * * *
+  workflow_dispatch:
+permissions:
+  contents: write
+  pull-requests: write
 jobs:
-  build:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        julia-version: [1.2.0]
-        julia-arch: [x86]
-        os: [ubuntu-latest]
+  CompatHelper:
+    runs-on: ubuntu-latest
     steps:
-      - uses: julia-actions/setup-julia@latest
+      - name: Check if Julia is already available in the PATH
+        id: julia_in_path
+        run: which julia
+        continue-on-error: true
+      - name: Install Julia, but only if it is not already available in the PATH
+        uses: julia-actions/setup-julia@v1
         with:
-          version: ${{ matrix.julia-version }}
-      - name: Install dependencies
-        run: julia -e 'using Pkg; Pkg.add(Pkg.PackageSpec(name = "CompatHelper", url = "https://github.com/bcbi/CompatHelper.jl.git"))'
-      - name: CompatHelper.main
+          version: '1'
+          arch: ${{ runner.arch }}
+        if: steps.julia_in_path.outcome != 'success'
+      - name: "Add the General registry via Git"
+        run: |
+          import Pkg
+          ENV["JULIA_PKG_SERVER"] = ""
+          Pkg.Registry.add("General")
+        shell: julia --color=yes {0}
+      - name: "Install CompatHelper"
+        run: |
+          import Pkg
+          name = "CompatHelper"
+          uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
+          version = "3"
+          Pkg.add(; name, uuid, version)
+        shell: julia --color=yes {0}
+      - name: "Run CompatHelper"
+        run: |
+          import CompatHelper
+          CompatHelper.main()
+        shell: julia --color=yes {0}
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          JULIA_DEBUG: CompatHelper
-        run: julia -e 'using CompatHelper; CompatHelper.main()'
+          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
+          # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}
diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml
new file mode 100644
index 00000000..f49313b6
--- /dev/null
+++ b/.github/workflows/TagBot.yml
@@ -0,0 +1,15 @@
+name: TagBot
+on:
+  issue_comment:  # fires on each newly created issue comment; the job-level `if` filters them
+    types:
+      - created
+  workflow_dispatch:  # also allows a manual run
+jobs:
+  TagBot:
+    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'  # ignore comments from anyone but JuliaTagBot
+    runs-on: ubuntu-latest
+    steps:
+      - uses: JuliaRegistries/TagBot@v1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          ssh: ${{ secrets.DOCUMENTER_KEY }}  # the DOCUMENTER_KEY secret doubles as TagBot's ssh input
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 00000000..4991c31d
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,47 @@
+name: CI  # Linux test matrix over Julia LTS and the latest 1.x, x86 and x64
+on:
+  - push
+  - pull_request
+jobs:
+  test:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # let every matrix entry finish even if one fails
+      matrix:
+        version:
+          - 'lts'
+          - '1'
+        os:
+          - ubuntu-latest
+          # - macOS-latest
+        arch:
+          - x86
+          - x64
+        exclude:  # only takes effect once macOS-latest is enabled above
+          - os: macOS-latest
+            arch: x86
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+          show-versioninfo: true
+      - uses: actions/cache@v3  # reuse downloaded Julia artifacts between runs
+        env:
+          cache-name: cache-artifacts
+        with:
+          path: ~/.julia/artifacts
+          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
+          restore-keys: |
+            ${{ runner.os }}-test-${{ env.cache-name }}-
+            ${{ runner.os }}-test-
+            ${{ runner.os }}-
+      - uses: julia-actions/julia-buildpkg@v1  # versioned tag instead of the floating "latest" ref
+      - uses: julia-actions/julia-runtest@v1  # versioned tag instead of the floating "latest" ref
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v4
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: lcov.info
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 00000000..c4f06471
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,17 @@
+name: Documentation  # runs julia-docdeploy on every push and pull request
+on:
+  - push
+  - pull_request
+jobs:
+  docs:
+    name: Documentation
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4  # same major version as ci.yml
+      - uses: julia-actions/setup-julia@v2  # same major version as ci.yml
+        with:
+          version: '1'
+      - uses: julia-actions/julia-docdeploy@releases/v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml
new file mode 100644
index 00000000..ba2c1dfa
--- /dev/null
+++ b/.github/workflows/downstream.yml
@@ -0,0 +1,78 @@
+name: IntegrationTest  # runs the test suites of downstream packages against this checkout
+on:
+  push:
+    branches: [master]
+    tags: [v*]
+    paths-ignore:
+      - 'LICENSE'
+      - 'README.md'
+      - '.github/workflows/TagBot.yml'
+  pull_request:
+    paths-ignore:
+      - 'LICENSE'
+      - 'README.md'
+      - '.github/workflows/TagBot.yml'
+
+concurrency:
+  group: build-${{ github.event.pull_request.number || github.ref }}-${{ github.workflow }}
+  cancel-in-progress: true  # a newer run in the same group cancels the one still running
+
+jobs:
+  pre_job:
+    # continue-on-error: true # Uncomment once integration is finished
+    runs-on: ubuntu-latest
+    # Map a step output to a job output
+    outputs:
+      should_skip: ${{ steps.skip_check.outputs.should_skip }}
+    steps:
+      - id: skip_check
+        uses: fkirc/skip-duplicate-actions@v5
+  test:
+    needs: pre_job
+    if: needs.pre_job.outputs.should_skip != 'true'  # skip when pre_job reports should_skip
+    name: ${{ matrix.package.group }}/${{ matrix.package.repo }}/${{ matrix.julia-version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # one failing downstream package must not cancel the others
+      matrix:
+        julia-version: ['1']
+        os: [ubuntu-latest]
+        package:
+          - {repo: ClassicalOrthogonalPolynomials.jl, group: JuliaApproximation}
+          - {repo: MultivariateOrthogonalPolynomials.jl, group: JuliaApproximation}
+          - {repo: ApproxFun.jl, group: JuliaApproximation}
+
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: ${{ matrix.julia-version }}
+          arch: x64
+      - uses: julia-actions/julia-buildpkg@v1  # versioned tag instead of the floating "latest" ref
+      - name: Clone Downstream
+        uses: actions/checkout@v4
+        with:
+          repository: ${{ matrix.package.group }}/${{ matrix.package.repo }}
+          path: downstream
+      - name: Load this and run the downstream tests
+        shell: julia --color=yes --project=downstream {0}
+        run: |
+          using Pkg
+          try
+            # force it to use this PR's version of the package
+            Pkg.develop(PackageSpec(path="."))  # resolver may fail with main deps
+            Pkg.update()
+            Pkg.test(; coverage = true)  # resolver may fail with test time deps
+          catch err
+            err isa Pkg.Resolve.ResolverError || rethrow()
+            # If we can't resolve that means this is incompatible by SemVer and this is fine
+            # It means we marked this as a breaking change, so we don't need to worry about
+            # Mistakenly introducing a breaking change, as we have intentionally made one
+            @info "Not compatible with this release. No problem." exception=err
+            exit(0)  # Exit immediately, as a success
+          end
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v4
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: lcov.info
diff --git a/.gitignore b/.gitignore
index 2a0e93eb..0a0d3048 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 docs/build/
-docs/site/
+docs/src/generated
 deps/build.log
 deps/libfasttransforms.*
 .DS_Store
 deps/FastTransforms/
+Manifest.toml
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index bab47058..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-# Documentation: http://docs.travis-ci.com/user/languages/julia/
-language: julia
-os:
-  - linux
-  - osx
-julia:
-  - 1.0
-  - 1.1
-  - 1.2
-  - 1.3
-  - nightly
-matrix:
-  allow_failures:
-    - julia: nightly
-addons:
-  apt:
-    packages: ['libquadmath0', 'libgomp1', 'libopenblas-dev', 'libfftw3-dev', 'libmpfr-dev']
-  homebrew:
-    packages: ['gcc@8', 'fftw', 'mpfr']
-    update: true
-
-notifications:
-  email: false
-after_success:
-  - julia -e 'using Pkg; cd(Pkg.dir("FastTransforms")); Pkg.add("Documenter"); include(joinpath("docs", "make.jl"))'
-  - julia -e 'using Pkg; cd(Pkg.dir("FastTransforms")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
diff --git a/Project.toml b/Project.toml
index 39a2acf6..2959561e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,27 +1,42 @@
 name = "FastTransforms"
 uuid = "057dd010-8810-581a-b7be-e3fc3b93f78c"
-version = "0.7.0"
+version = "0.17"
+
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
-BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
-DSP = "717857b8-e6f2-59f4-9121-6e50c889abd2"
+ArrayLayouts = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
+BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
+FastTransforms_jll = "34b6f7d7-08f9-5794-9e10-3819e4c7e49a"
+FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
+GenericFFT = "a8297547-1b15-4a5a-a998-a2ac5f1cef28"
+LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+RecurrenceRelationships = "807425ed-42ea-44d6-a357-6771516d7b2c"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 ToeplitzMatrices = "c751599d-da0a-543b-9d20-d0a503d91d24"
 
 [compat]
-AbstractFFTs = "0.4, 0.5"
-BinaryProvider = "0.5.8"
-DSP = "0.6"
-FFTW = "1"
-FastGaussQuadrature = "0.4"
-Reexport = "0.2"
-SpecialFunctions = "0.8"
-ToeplitzMatrices = "0.6"
-julia = "1"
+AbstractFFTs = "1.0"
+ArrayLayouts = "1.10"
+BandedMatrices = "1.5"
+FFTW = "1.7"
+FastGaussQuadrature = "0.4, 0.5, 1"
+FastTransforms_jll = "0.6.2"
+FillArrays = "0.9, 0.10, 0.11, 0.12, 0.13, 1"
+GenericFFT = "0.1"
+LazyArrays = "2.2"
+RecurrenceRelationships = "0.2"
+SpecialFunctions = "0.10, 1, 2"
+ToeplitzMatrices = "0.7.1, 0.8"
+julia = "1.7"
+
+[extras]
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test", "Random"]
diff --git a/README.md b/README.md
index 6ad759d8..d2dfe381 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # FastTransforms.jl
 
-[![Travis](https://travis-ci.org/JuliaApproximation/FastTransforms.jl.svg?branch=master)](https://travis-ci.org/JuliaApproximation/FastTransforms.jl) [![AppVeyor](https://ci.appveyor.com/api/projects/status/oba9qush15q3x8pb/branch/master?svg=true)](https://ci.appveyor.com/project/MikaelSlevinsky/fasttransforms-jl/branch/master) [![codecov](https://codecov.io/gh/JuliaApproximation/FastTransforms.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaApproximation/FastTransforms.jl) [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaApproximation.github.io/FastTransforms.jl/stable) [![](https://img.shields.io/badge/docs-latest-blue.svg)](https://JuliaApproximation.github.io/FastTransforms.jl/latest)
+[![Build Status](https://github.com/JuliaApproximation/FastTransforms.jl/workflows/CI/badge.svg)](https://github.com/JuliaApproximation/FastTransforms.jl/actions?query=workflow%3ACI) [![codecov](https://codecov.io/gh/JuliaApproximation/FastTransforms.jl/branch/master/graph/badge.svg?token=BxTvSNgmLL)](https://codecov.io/gh/JuliaApproximation/FastTransforms.jl) [![](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaApproximation.github.io/FastTransforms.jl/stable) [![](https://img.shields.io/badge/docs-dev-blue.svg)](https://JuliaApproximation.github.io/FastTransforms.jl/dev)
+[![pkgeval](https://juliahub.com/docs/General/FastTransforms/stable/pkgeval.svg)](https://juliaci.github.io/NanosoldierReports/pkgeval_badges/report.html)
 
 `FastTransforms.jl` allows the user to conveniently work with orthogonal polynomials with degrees well into the millions.
 
@@ -8,7 +9,7 @@ This package provides a Julia wrapper for the [C library](https://github.com/Mik
 
 ## Installation
 
-The build script, which works on macOS, Linux, and Windows systems, downloads precompiled binaries of the latest version of [FastTransforms](https://github.com/MikaelSlevinsky/FastTransforms) or builds them from source, as governed by the environment variable `FT_BUILD_FROM_SOURCE=true/false`. This library depends on `FFTW`, `MPFR`, and `OpenBLAS` (on Linux and Windows), which must be present for a successful build. With dependencies, installation may be as straightforward as:
+Installation, which uses [BinaryBuilder](https://github.com/JuliaPackaging/BinaryBuilder.jl) for all of Julia's supported platforms (in particular Sandy Bridge Intel processors and beyond), may be as straightforward as:
 
 ```julia
 pkg> add FastTransforms
@@ -19,7 +20,7 @@ julia> using FastTransforms, LinearAlgebra
 
 ## Fast orthogonal polynomial transforms
 
-The 26 orthogonal polynomial transforms are listed in `FastTransforms.kind2string.(0:25)`. Univariate transforms may be planned with the standard normalization or with orthonormalization. For multivariate transforms, the standard normalization may be too severe for floating-point computations, so it is omitted. Here are two examples:
+The orthogonal polynomial transforms are listed in `FastTransforms.Transforms` or `FastTransforms.kind2string.(instances(FastTransforms.Transforms))`. Univariate transforms may be planned with the standard normalization or with orthonormalization. For multivariate transforms, the standard normalization may be too severe for floating-point computations, so it is omitted. Here are two examples:
 
 ### The Chebyshev--Legendre transform
 
@@ -95,6 +96,8 @@ julia> norm(F-H)/norm(F)
 
 Due to the structure of the spherical harmonic connection problem, these transforms may also be performed in-place with `lmul!` and `ldiv!`.
 
+See also [FastSphericalHarmonics.jl](https://github.com/eschnett/FastSphericalHarmonics.jl) for a simpler interface to the spherical harmonic transforms defined in this package.
+
 ## Nonuniform fast Fourier transforms
 
 The NUFFTs are implemented thanks to [Alex Townsend](https://github.com/ajt60gaibb):
@@ -155,20 +158,16 @@ julia> @time norm(ipaduatransform(paduatransform(v)) - v)/norm(v)
 
 ```
 
-# References:
-
-   [1]  B. Alpert and V. Rokhlin. <a href="http://dx.doi.org/10.1137/0912009">A fast algorithm for the evaluation of Legendre expansions</a>, *SIAM J. Sci. Stat. Comput.*, **12**:158—179, 1991.
-
-   [2]  N. Hale and A. Townsend. <a href="http://dx.doi.org/10.1137/130932223">A fast, simple, and stable Chebyshev—Legendre transform using an asymptotic formula</a>, *SIAM J. Sci. Comput.*, **36**:A148—A167, 2014.
+# References
 
-   [3]  J. Keiner. <a href="http://dx.doi.org/10.1137/070703065">Computing with expansions in Gegenbauer polynomials</a>, *SIAM J. Sci. Comput.*, **31**:2151—2171, 2009.
+[1]  D. Ruiz-Antolín and A. Townsend, [A nonuniform fast Fourier transform based on low rank approximation](https://doi.org/10.1137/17M1134822), *SIAM J. Sci. Comput.*, **40**:A529–A547, 2018.
 
-   [4]  D. Ruiz—Antolín and A. Townsend. <a href="https://arxiv.org/abs/1701.04492">A nonuniform fast Fourier transform based on low rank approximation</a>, arXiv:1701.04492, 2017.
+[2] K. Gumerov, S. Rigg, and R. M. Slevinsky, [Fast measure modification of orthogonal polynomials via matrices with displacement structure](https://arxiv.org/abs/2412.17663), arXiv:2412.17663, 2024.
 
-   [5]  R. M. Slevinsky. <a href="https://doi.org/10.1093/imanum/drw070">On the use of Hahn's asymptotic formula and stabilized recurrence for a fast, simple, and stable Chebyshev—Jacobi transform</a>, *IMA J. Numer. Anal.*, **38**:102—124, 2018.
+[3] T. S. Gutleb, S. Olver and R. M. Slevinsky, [Polynomial and rational measure modifications of orthogonal polynomials via infinite-dimensional banded matrix factorizations](https://arxiv.org/abs/2302.08448), arXiv:2302.08448, 2023.
 
-   [6]  R. M. Slevinsky. <a href="https://doi.org/10.1016/j.acha.2017.11.001">Fast and backward stable transforms between spherical harmonic expansions and bivariate Fourier series</a>, *Appl. Comput. Harmon. Anal.*, **47**:585—606, 2019.
+[4] S. Olver, R. M. Slevinsky, and A. Townsend, [Fast algorithms using orthogonal polynomials](https://doi.org/10.1017/S0962492920000045), *Acta Numerica*, **29**:573–699, 2020.
 
-   [7]  R. M. Slevinsky, <a href="https://arxiv.org/abs/1711.07866">Conquering the pre-computation in two-dimensional harmonic polynomial transforms</a>, arXiv:1711.07866, 2017.
+[5]  R. M. Slevinsky, [Fast and backward stable transforms between spherical harmonic expansions and bivariate Fourier series](https://doi.org/10.1016/j.acha.2017.11.001), *Appl. Comput. Harmon. Anal.*, **47**:585–606, 2019.
 
-   [8]  A. Townsend, M. Webb, and S. Olver. <a href="https://doi.org/10.1090/mcom/3277">Fast polynomial transforms based on Toeplitz and Hankel matrices</a>, in press at *Math. Comp.*, 2017.
+[6]  R. M. Slevinsky, [Conquering the pre-computation in two-dimensional harmonic polynomial transforms](https://arxiv.org/abs/1711.07866), arXiv:1711.07866, 2017.
diff --git a/deps/build.jl b/deps/build.jl
index 7b419f95..76b27a4e 100644
--- a/deps/build.jl
+++ b/deps/build.jl
@@ -1,33 +1,6 @@
-using BinaryProvider
-import Libdl
-
-version = v"0.2.9"
-
-if arch(platform_key_abi()) != :x86_64
-    @warn "FastTransforms has only been tested on x86_64 architectures."
-end
-
-const extension = Sys.isapple() ? "dylib" : Sys.islinux() ? "so" : Sys.iswindows() ? "dll" : ""
-
-print_error() = error(
-    "FastTransforms could not be properly installed.\n Please check that you have all dependencies installed. " *
-    "Sample installation of dependencies:\n" *
-    print_platform_error(platform_key_abi())
-)
-
-print_platform_error(p::Platform) = "On $(BinaryProvider.platform_name(p)), please consider opening a pull request to add support.\n"
-print_platform_error(p::MacOS) = "On MacOS\n\tbrew install gcc@8 fftw mpfr\n"
-print_platform_error(p::Linux) = "On Linux\n\tsudo apt-get install gcc-8 libblas-dev libopenblas-base libfftw3-dev libmpfr-dev\n"
-print_platform_error(p::Windows) = "On Windows\n\tvcpkg install openblas:x64-windows fftw3[core,threads]:x64-windows mpir:x64-windows mpfr:x64-windows\n"
-
-# Rationale is as follows: The build is pretty fast, so on Linux it is typically easiest
-# to just use the gcc of the system to build the library and include it. On MacOS, however,
-# we need to actually install a gcc first, because Apple's OS comes only shipped with clang,
-# so here we download the binary.
-ft_build_from_source = get(ENV, "FT_BUILD_FROM_SOURCE", Sys.isapple() ? "false" : "true")
-if ft_build_from_source == "true"
+if get(ENV, "FT_BUILD_FROM_SOURCE", "false") == "true"
+    extension = Sys.isapple() ? "dylib" : Sys.islinux() ? "so" : Sys.iswindows() ? "dll" : ""
     make = Sys.iswindows() ? "mingw32-make" : "make"
-    compiler = Sys.isapple() ? "CC=gcc-8" : "CC=gcc"
     flags = Sys.isapple() ? "FT_USE_APPLEBLAS=1" : Sys.iswindows() ? "FT_FFTW_WITH_COMBINED_THREADS=1" : ""
     script = """
         set -e
@@ -35,35 +8,31 @@ if ft_build_from_source == "true"
         if [ -d "FastTransforms" ]; then
             cd FastTransforms
             git fetch
-            git checkout v$version
+            git checkout master
+            git pull
             cd ..
         else
-            git clone -b v$version https://github.com/MikaelSlevinsky/FastTransforms.git FastTransforms
+            git clone https://github.com/MikaelSlevinsky/FastTransforms.git FastTransforms
         fi
         cd FastTransforms
-        $make lib $compiler $flags
+        $make assembly
+        $make lib $flags
         cd ..
         mv -f FastTransforms/libfasttransforms.$extension libfasttransforms.$extension
     """
     try
         run(`bash -c $(script)`)
     catch
-        print_error()
+        error(
+            "FastTransforms could not be properly installed.\n Please check that you have all dependencies installed. " *
+            "Sample installation of dependencies:\n" *
+            (Sys.isapple() ? "On MacOS\n\tbrew install libomp fftw mpfr\n" :
+             Sys.islinux() ? "On Linux\n\tsudo apt-get install libomp-dev libblas-dev libopenblas-base libfftw3-dev libmpfr-dev\n" :
+             Sys.iswindows() ? "On Windows\n\tvcpkg install openblas:x64-windows fftw3[core,threads]:x64-windows mpir:x64-windows mpfr:x64-windows\n" :
+             "On your platform, please consider opening a pull request to add support to build from source.\n")
+        )
     end
     println("FastTransforms built from source.")
 else
-    const GCC = BinaryProvider.detect_compiler_abi().gcc_version
-    namemap = Dict(:gcc4 => "gcc-4.9", :gcc5 => "gcc-5", :gcc6 => "gcc-6",
-                   :gcc7 => "gcc-7", :gcc8 => "gcc-8", :gcc9 => "gcc-9")
-    if !(GCC in keys(namemap))
-        error("Please ensure you have a version of gcc from gcc-4.9 to gcc-9.")
-    end
-    try
-        download("https://github.com/MikaelSlevinsky/FastTransforms/releases/download/" *
-                 "v$version/libfasttransforms.v$version.$(namemap[GCC]).$extension",
-                 joinpath(dirname(@__DIR__), "deps", "libfasttransforms.$extension"))
-    catch
-        print_error()
-    end
-    println("FastTransforms installed by downloading binaries.")
+    println("FastTransforms using precompiled binaries.")
 end
diff --git a/docs/Project.toml b/docs/Project.toml
new file mode 100644
index 00000000..169ffdb7
--- /dev/null
+++ b/docs/Project.toml
@@ -0,0 +1,12 @@
+[deps]  # packages for the docs environment that sits next to docs/make.jl
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+FastTransforms = "057dd010-8810-581a-b7be-e3fc3b93f78c"
+LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
+LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
+Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+PlotlyJS = "f0f68f2c-4968-5e81-91da-67840de0976a"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+
+[compat]
+Documenter = "~0.24"  # tilde specifier: 0.24.x releases only
+Literate = "~2.8"  # tilde specifier: 2.8.x releases only
diff --git a/docs/make.jl b/docs/make.jl
index 31eebc8d..dd530e8f 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,14 +1,59 @@
-using Documenter, FastTransforms
+using Documenter, FastTransforms, Literate, Plots
+
+plotlyjs()
+
+const EXAMPLES_DIR = joinpath(@__DIR__, "..", "examples")
+const OUTPUT_DIR   = joinpath(@__DIR__, "src/generated")
+
+examples = [
+    "annulus.jl",
+    "automaticdifferentiation.jl",
+    "chebyshev.jl",
+    "disk.jl",
+    "halfrange.jl",
+    "nonlocaldiffusion.jl",
+    "padua.jl",
+    "sphere.jl",
+    "spinweighted.jl",
+    "subspaceangles.jl",
+    "triangle.jl",
+]
+
+function uncomment_objects(str)
+    str = replace(str, "###```@raw" => "```\n\n```@raw")
+    str = replace(str, "###<object" => "<object")
+    str = replace(str, "###```\n```" => "```")
+    str
+end
+
+for example in examples
+    example_filepath = joinpath(EXAMPLES_DIR, example)
+    Literate.markdown(example_filepath, OUTPUT_DIR; execute=true, postprocess = uncomment_objects)
+end
 
 makedocs(
-			doctest = false,
-			format = Documenter.HTML(),
-			sitename = "FastTransforms.jl",
-			authors = "Richard Mikael Slevinsky",
-			pages = Any[
-					"Home" => "index.md"
-					]
-			)
+            doctest = false,
+            format = Documenter.HTML(),
+            sitename = "FastTransforms.jl",
+            authors = "Richard Mikael Slevinsky",
+            pages = Any[
+                    "Home" => "index.md",
+                    "Development" => "dev.md",
+                    "Examples" => [
+                        "generated/annulus.md",
+                        "generated/automaticdifferentiation.md",
+                        "generated/chebyshev.md",
+                        "generated/disk.md",
+                        "generated/halfrange.md",
+                        "generated/nonlocaldiffusion.md",
+                        "generated/padua.md",
+                        "generated/sphere.md",
+                        "generated/spinweighted.md",
+                        "generated/subspaceangles.md",
+                        "generated/triangle.md",
+                        ],
+                    ]
+        )
 
 
 deploydocs(
diff --git a/docs/src/dev.md b/docs/src/dev.md
new file mode 100644
index 00000000..80971817
--- /dev/null
+++ b/docs/src/dev.md
@@ -0,0 +1,80 @@
+# Development Documentation
+
+The core of [`FastTransforms.jl`](https://github.com/JuliaApproximation/FastTransforms.jl) is developed in parallel with the [C library](https://github.com/MikaelSlevinsky/FastTransforms) of the same name. Julia and C interoperability is enhanced by the [BinaryBuilder](https://github.com/JuliaPackaging/BinaryBuilder.jl) infrastructure, which provides the user a safe and seamless experience using a package in a different language.
+
+## Why two packages?
+
+Orthogonal polynomial transforms are performance-sensitive imperative tasks. Yet, many of Julia's rich and evolving language features are simply unnecessary for defining these computational routines. Moreover, rapid language changes in Julia (as compared to C) have been more than a perturbation to this repository in the past.
+
+The C library generates assembly for vectorized operations such as single instruction multiple data (SIMD) that is more efficient than that generated by a compiler without human intervention. It also uses OpenMP to introduce shared memory parallelism for large tasks. Finally, calling into precompiled binaries reduces the Julia package's pre-compilation and dependencies, improving the user experience. Some of these capabilities also exist in Julia, but with C there is frankly more control over performance.
+
+C libraries are easier to call from any other language, partly explaining why the Python-based package manager Spack [already supports the C library](https://spack.readthedocs.io/en/latest/package_list.html#fasttransforms) through third-party efforts.
+
+In Julia, a parametric composite type with unrestricted type parameters is just about as big as `Any`. Such a type allows the Julia API to far exceed the C API in its ability to unify all of the orthogonal polynomial transforms and present them as linear operators. The `mutable struct FTPlan{T, N, K}`, together with `AdjointFTPlan` and `TransposeFTPlan`, are the core Julia types in this repository. Whereas `T` is understood to represent the element type of the plan and `N` represents the number of leading dimensions of the array on which it operates, `K` is a mere enumeration which serves to distinguish the orthogonal polynomials at play. For example, `FTPlan{Float64, 1, LEG2CHEB}` represents the necessary pre-computations to convert 64-bit Legendre series to Chebyshev series (of the first kind). `N == 1` because Chebyshev and Legendre series are naturally represented with vectors of coefficients. However, this particular plan may operate not only on vectors but also on matrices, column-by-column.
+
+## The developer's right to build from source
+
+Precompiled binaries are important for users, but development in C may be greatly accelerated by coupling it with a dynamic language such as Julia. For this reason, the repository preserves the developer's right to build the C library from source by setting an environment variable to trigger the build script:
+
+```julia
+julia> ENV["FT_BUILD_FROM_SOURCE"] = "true"
+"true"
+
+(@v1.5) pkg> build FastTransforms
+   Building FFTW ──────────→ `~/.julia/packages/FFTW/ayqyZ/deps/build.log`
+   Building TimeZones ─────→ `~/.julia/packages/TimeZones/K98G0/deps/build.log`
+   Building FastTransforms → `~/.julia/dev/FastTransforms/deps/build.log`
+
+julia> using FastTransforms
+[ Info: Precompiling FastTransforms [057dd010-8810-581a-b7be-e3fc3b93f78c]
+
+```
+
+This lets the developer experiment with new features through `ccall`ing into bleeding edge source code. Customizing the build script further allows the developer to track a different branch or even a fork.
+
+## From release to release to release
+
+To get from a C library release to a Julia package release, the developer needs to update Yggdrasil's [build_tarballs.jl](https://github.com/JuliaPackaging/Yggdrasil/blob/master/F/FastTransforms/build_tarballs.jl) script for the new version and its 256-bit SHA. On macOS, the SHA can be found by:
+
+```julia
+shell> curl https://codeload.github.com/MikaelSlevinsky/FastTransforms/tar.gz/v0.6.2 --output FastTransforms.tar.gz
+  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
+                                 Dload  Upload   Total   Spent    Left  Speed
+100  168k    0  168k    0     0   429k      0 --:--:-- --:--:-- --:--:--  429k
+
+shell> shasum -a 256 FastTransforms.tar.gz
+fd00befcb0c20ba962a8744a7b9139355071ee95be70420de005b7c0f6e023aa  FastTransforms.tar.gz
+
+shell> rm -f FastTransforms.tar.gz
+
+```
+
+Using [SHA.jl](https://github.com/JuliaCrypto/SHA.jl), the SHA can also be found by:
+
+```julia
+shell> curl https://codeload.github.com/MikaelSlevinsky/FastTransforms/tar.gz/v0.6.2 --output FastTransforms.tar.gz
+  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
+                                 Dload  Upload   Total   Spent    Left  Speed
+100  168k    0  168k    0     0   442k      0 --:--:-- --:--:-- --:--:--  443k
+
+julia> using SHA
+
+julia> open("FastTransforms.tar.gz") do f
+           bytes2hex(sha256(f))
+       end
+"fd00befcb0c20ba962a8744a7b9139355071ee95be70420de005b7c0f6e023aa"
+
+shell> rm -f FastTransforms.tar.gz
+
+```
+
+Then we wait for the friendly folks at [JuliaPackaging](https://github.com/JuliaPackaging) to merge the pull request to Yggdrasil, triggering a new release of the [FastTransforms_jll.jl](https://github.com/JuliaBinaryWrappers/FastTransforms_jll.jl) meta package that stores all precompiled binaries. With this release, we update the FastTransforms.jl [Project.toml](https://github.com/JuliaApproximation/FastTransforms.jl/blob/master/Project.toml) to point to the latest release and register the new version.
+
+Since development of Yggdrasil is quite rapid, a fork may easily become stale. Git permits the developer to forcibly make a master branch on a fork even with upstream master:
+
+```
+git fetch upstream
+git checkout master
+git reset --hard upstream/master
+git push origin master --force
+```
diff --git a/docs/src/index.md b/docs/src/index.md
index 2b2cfe51..3c2c2844 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -2,7 +2,7 @@
 
 ## Introduction
 
-`FastTransforms.jl` allows the user to conveniently work with orthogonal polynomials with degrees well into the millions.
+[`FastTransforms.jl`](https://github.com/JuliaApproximation/FastTransforms.jl) allows the user to conveniently work with orthogonal polynomials with degrees well into the millions.
 
 This package provides a Julia wrapper for the [C library](https://github.com/MikaelSlevinsky/FastTransforms) of the same name. Additionally, all three types of nonuniform fast Fourier transforms available, as well as the Padua transform.
 
@@ -10,33 +10,22 @@ This package provides a Julia wrapper for the [C library](https://github.com/Mik
 
 For this documentation, please see the documentation for [FastTransforms](https://github.com/MikaelSlevinsky/FastTransforms). Most transforms have separate forward and inverse plans. In some instances, however, the inverse is in the sense of least-squares, and therefore only the forward transform is planned.
 
-## Nonuniform fast Fourier transforms
+### Modified orthogonal polynomials via fast Cholesky factorization of the Gram matrix
 
 ```@docs
-nufft1
+GramMatrix
+ChebyshevGramMatrix
 ```
 
-```@docs
-nufft2
-```
+## Nonuniform fast Fourier transforms
 
 ```@docs
+nufft1
+nufft2
 nufft3
-```
-
-```@docs
 inufft1
-```
-
-```@docs
 inufft2
-```
-
-```@docs
 paduatransform
-```
-
-```@docs
 ipaduatransform
 ```
 
@@ -44,13 +33,7 @@ ipaduatransform
 
 ```@docs
 gaunt
-```
-
-```@docs
 paduapoints
-```
-
-```@docs
 sphevaluate
 ```
 
@@ -60,29 +43,11 @@ sphevaluate
 
 ```@docs
 FastTransforms.half
-```
-
-```@docs
 FastTransforms.two
-```
-
-```@docs
 FastTransforms.δ
-```
-
-```@docs
 FastTransforms.Λ
-```
-
-```@docs
 FastTransforms.lambertw
-```
-
-```@docs
 FastTransforms.pochhammer
-```
-
-```@docs
 FastTransforms.stirlingseries
 ```
 
@@ -90,48 +55,21 @@ FastTransforms.stirlingseries
 
 ```@docs
 FastTransforms.clenshawcurtisnodes
-```
-
-```@docs
 FastTransforms.clenshawcurtisweights
-```
-
-```@docs
 FastTransforms.fejernodes1
-```
-
-```@docs
 FastTransforms.fejerweights1
-```
-
-```@docs
 FastTransforms.fejernodes2
-```
-
-```@docs
 FastTransforms.fejerweights2
-```
-
-```@docs
 FastTransforms.chebyshevmoments1
-```
-
-```@docs
 FastTransforms.chebyshevjacobimoments1
-```
-
-```@docs
 FastTransforms.chebyshevlogmoments1
-```
-
-```@docs
 FastTransforms.chebyshevmoments2
-```
-
-```@docs
 FastTransforms.chebyshevjacobimoments2
+FastTransforms.chebyshevlogmoments2
 ```
 
+### Elliptic Submodule
+
 ```@docs
-FastTransforms.chebyshevlogmoments2
+FastTransforms.Elliptic
 ```
diff --git a/examples/annulus.jl b/examples/annulus.jl
new file mode 100644
index 00000000..a6d7445e
--- /dev/null
+++ b/examples/annulus.jl
@@ -0,0 +1,73 @@
+# # Integration on an annulus
+# In this example, we explore integration of the function:
+# ```math
+#   f(x,y) = \frac{x^3}{x^2+y^2-\frac{1}{4}},
+# ```
+# over the annulus defined by $\{(r,\theta) : \rho < r < 1, 0 < \theta < 2\pi\}$
+# with parameter $\rho = \frac{2}{3}$. We will calculate the integral:
+# ```math
+#   \int_0^{2\pi}\int_{\frac{2}{3}}^1 f(r\cos\theta,r\sin\theta)^2r{\rm\,d}r{\rm\,d}\theta,
+# ```
+# by analyzing the function in an annulus polynomial series.
+# We analyze the function on an $N\times M$ tensor product grid defined by:
+# ```math
+# \begin{aligned}
+# r_n & = \sqrt{\cos^2\left[(n+\tfrac{1}{2})\pi/2N\right] + \rho^2 \sin^2\left[(n+\tfrac{1}{2})\pi/2N\right]},\quad{\rm for}\quad 0\le n < N,\quad{\rm and}\\
+# \theta_m & = 2\pi m/M,\quad{\rm for}\quad 0\le m < M;
+# \end{aligned}
+# ```
+# we convert the function samples to Chebyshev×Fourier coefficients using
+# `plan_annulus_analysis`; and finally, we transform the Chebyshev×Fourier
+# coefficients to annulus polynomial coefficients using `plan_ann2cxf`.
+#
+# For the storage pattern of the arrays, please consult the
+# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).
+
+using FastTransforms, LinearAlgebra, Plots
+const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
+mkpath(GENFIGS) # creates missing parent directories too; no-op if the directory already exists
+plotlyjs()
+
+# Our function $f$ on the annulus:
+f = (x,y) -> x^3/(x^2+y^2-1/4)
+
+# The annulus polynomial degree:
+N = 8
+M = 4N-3
+
+# The annulus inner radius:
+ρ = 2/3
+
+# The radial grid:
+r = [begin t = (N-n-0.5)/(2N); ct = sinpi(t); st = cospi(t); sqrt(ct^2+ρ^2*st^2) end for n in 0:N-1]
+
+# The angular grid (mod $\pi$):
+θ = (0:M-1)*2/M
+
+# On the mapped tensor product grid, our function samples are:
+F = [f(r*cospi(θ), r*sinpi(θ)) for r in r, θ in θ]
+
+# We superpose a surface plot of $f$ on top of the grid:
+X = [r*cospi(θ) for r in r, θ in θ]
+Y = [r*sinpi(θ) for r in r, θ in θ]
+scatter3d(vec(X), vec(Y), vec(0F); markersize=0.75, markercolor=:red)
+surface!(X, Y, F; legend=false, xlabel="x", ylabel="y", zlabel="f")
+savefig(joinpath(GENFIGS, "annulus.html"))
+###```@raw html
+###<object type="text/html" data="../annulus.html" style="width:100%;height:400px;"></object>
+###```
+
+# We precompute an Annulus--Chebyshev×Fourier plan:
+α, β, γ = 0, 0, 0
+P = plan_ann2cxf(F, α, β, γ, ρ)
+
+# And an FFTW Chebyshev×Fourier analysis plan on the annulus:
+PA = plan_annulus_analysis(F, ρ)
+
+# Its annulus coefficients are:
+U = P\(PA*F)
+
+# The annulus coefficients are useful for integration.
+# The integral of $[f(x,y)]^2$ over the annulus is
+# approximately the square of the 2-norm of the coefficients:
+norm(U)^2, 5π/8*(1675/4536+9*log(3)/32-3*log(7)/32)
diff --git a/examples/automaticdifferentiation.jl b/examples/automaticdifferentiation.jl
new file mode 100644
index 00000000..2c5f8f82
--- /dev/null
+++ b/examples/automaticdifferentiation.jl
@@ -0,0 +1,43 @@
+# # Automatic differentiation through spherical harmonic transforms
+# This example finds a positive value of $\lambda$ in:
+# ```math
+# f(r) = \sin[\lambda (k\cdot r)],
+# ```
+# for some $k,r\in\mathbb{S}^2$ such that $\int_{\mathbb{S}^2} f^2 {\rm\,d}\Omega = 1$.
+# We do this by using derivative information through:
+# ```math
+# \dfrac{\partial f}{\partial \lambda} = (k\cdot r) \cos[\lambda (k\cdot r)].
+# ```
+
+using FastTransforms, LinearAlgebra
+
+# The colatitudinal grid (mod $\pi$):
+N = 15
+θ = (0.5:N-0.5)/N
+
+# The longitudinal grid (mod $\pi$):
+M = 2*N-1
+φ = (0:M-1)*2/M
+
+# We precompute a spherical harmonic--Fourier plan:
+P = plan_sph2fourier(Float64, N)
+
+# And an FFTW Fourier analysis plan on $\mathbb{S}^2$:
+PA = plan_sph_analysis(Float64, N, M)
+
+# Our choice of $k$ and angular parametrization of $r$:
+k = [2/7, 3/7, 6/7]
+r = (θ,φ) -> [sinpi(θ)*cospi(φ), sinpi(θ)*sinpi(φ), cospi(θ)]
+
+# Our initial guess for $\lambda$:
+λ = 1.0
+
+# Then we run Newton iteration and grab an espresso:
+for _ in 1:7
+    F = [sin(λ*(k⋅r(θ,φ))) for θ in θ, φ in φ]
+    Fλ = [(k⋅r(θ,φ))*cos(λ*(k⋅r(θ,φ))) for θ in θ, φ in φ]
+    U = P\(PA*F)
+    Uλ = P\(PA*Fλ)
+    global λ = λ - (norm(U)^2-1)/(2*sum(U.*Uλ))
+    println("λ: $(rpad(λ, 18)) and the 2-norm: $(rpad(norm(U), 18))")
+end
diff --git a/examples/chebyshev.jl b/examples/chebyshev.jl
index 313467c5..5f553b09 100644
--- a/examples/chebyshev.jl
+++ b/examples/chebyshev.jl
@@ -1,52 +1,46 @@
-#############
+# # Chebyshev transform
 # This demonstrates the Chebyshev transform and inverse transform,
 # explaining precisely the normalization and points
-#############
 
 using FastTransforms
-
-# first kind points -> first kind polynomials
 n = 20
-p_1 = chebyshevpoints(Float64, n; kind=1)
-f = exp.(p_1)
-f̌ = chebyshevtransform(f; kind=1)
 
+# First kind points $\to$ first kind polynomials
+p_1 = chebyshevpoints(Float64, n, Val(1))
+f = exp.(p_1)
+f̌ = chebyshevtransform(f, Val(1))
 f̃ = x -> [cos(k*acos(x)) for k=0:n-1]' * f̌
 f̃(0.1) ≈ exp(0.1)
 
-# first kind polynomials -> first kind points
-ichebyshevtransform(f̌; kind=1) ≈ exp.(p_1)
+# First kind polynomials $\to$ first kind points
+ichebyshevtransform(f̌, Val(1)) ≈ exp.(p_1)
 
-# second kind points -> first kind polynomials
-p_2 = chebyshevpoints(Float64, n; kind=2)
+# Second kind points $\to$ first kind polynomials
+p_2 = chebyshevpoints(Float64, n, Val(2))
 f = exp.(p_2)
-f̌ = chebyshevtransform(f; kind=2)
-
+f̌ = chebyshevtransform(f, Val(2))
 f̃ = x -> [cos(k*acos(x)) for k=0:n-1]' * f̌
 f̃(0.1) ≈ exp(0.1)
 
-# first kind polynomials -> second kind points
-ichebyshevtransform(f̌; kind=2) ≈ exp.(p_2)
+# First kind polynomials $\to$ second kind points
+ichebyshevtransform(f̌, Val(2)) ≈ exp.(p_2)
 
-
-# first kind points -> second kind polynomials
-n = 20
-p_1 = chebyshevpoints(Float64, n; kind=1)
+# First kind points $\to$ second kind polynomials
+p_1 = chebyshevpoints(Float64, n, Val(1))
 f = exp.(p_1)
-f̌ = chebyshevutransform(f; kind=1)
+f̌ = chebyshevutransform(f, Val(1))
 f̃ = x -> [sin((k+1)*acos(x))/sin(acos(x)) for k=0:n-1]' * f̌
 f̃(0.1) ≈ exp(0.1)
 
-# second kind polynomials -> first kind points
-ichebyshevutransform(f̌; kind=1) ≈ exp.(p_1)
-
+# Second kind polynomials $\to$ first kind points
+ichebyshevutransform(f̌, Val(1)) ≈ exp.(p_1)
 
-# second kind points -> second kind polynomials
-p_2 = chebyshevpoints(Float64, n; kind=2)[2:n-1]
+# Second kind points $\to$ second kind polynomials
+p_2 = chebyshevpoints(Float64, n, Val(2))[2:n-1]
 f = exp.(p_2)
-f̌ = chebyshevutransform(f; kind=2)
+f̌ = chebyshevutransform(f, Val(2))
 f̃ = x -> [sin((k+1)*acos(x))/sin(acos(x)) for k=0:n-3]' * f̌
 f̃(0.1) ≈ exp(0.1)
 
-# second kind polynomials -> second kind points
-ichebyshevutransform(f̌; kind=2) ≈ exp.(p_2)
+# Second kind polynomials $\to$ second kind points
+ichebyshevutransform(f̌, Val(2)) ≈ exp.(p_2)
diff --git a/examples/disk.jl b/examples/disk.jl
index 4a4b6c4d..19db1731 100644
--- a/examples/disk.jl
+++ b/examples/disk.jl
@@ -1,47 +1,127 @@
-#############
+# # Holomorphic integration on the unit disk
 # In this example, we explore integration of a harmonic function:
-#
-#   f(x,y) = (x^2-y^2+1)/[(x^2-y^2+1)^2+(2xy+1)^2],
-#
+# ```math
+#   f(x,y) = \frac{x^2-y^2+1}{(x^2-y^2+1)^2+(2xy+1)^2},
+# ```
 # over the unit disk. In this case, we know from complex analysis that the
-# integral of a holomorphic function is equal to π × f(0,0).
-# We analyze the function on an N×M tensor product grid defined by:
-#
-#   rₙ = cos[(n+1/2)π/2N], for 0 ≤ n < N, and
-#
-#   θₘ = 2π m/M, for 0 ≤ m < M;
-#
+# integral of a holomorphic function is equal to $\pi \times f(0,0)$.
+# We analyze the function on an $N\times M$ tensor product grid defined by:
+# ```math
+# \begin{aligned}
+# r_n & = \cos\left[(n+\tfrac{1}{2})\pi/2N\right],\quad{\rm for}\quad 0\le n < N,\quad{\rm and}\\
+# \theta_m & = 2\pi m/M,\quad{\rm for}\quad 0\le m < M;
+# \end{aligned}
+# ```
 # we convert the function samples to Chebyshev×Fourier coefficients using
 # `plan_disk_analysis`; and finally, we transform the Chebyshev×Fourier
-# coefficients to disk harmonic coefficients using `plan_disk2cxf`.
+# coefficients to Zernike polynomial coefficients using `plan_disk2cxf`.
 #
-# For the storage pattern of the arrays, please consult the documentation.
-#############
+# For the storage pattern of the arrays, please consult the
+# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).
 
-using FastTransforms, LinearAlgebra
+using FastTransforms, LinearAlgebra, Plots
+const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
+mkpath(GENFIGS) # creates missing parent directories too; no-op if the directory already exists
+plotlyjs()
 
+# Our function $f$ on the disk:
 f = (x,y) -> (x^2-y^2+1)/((x^2-y^2+1)^2+(2x*y+1)^2)
 
-N = 5
+# The Zernike polynomial degree:
+N = 15
 M = 4N-3
 
+# The radial grid:
 r = [sinpi((N-n-0.5)/(2N)) for n in 0:N-1]
-θ = (0:M-1)*2/M # mod π.
+
+# The angular grid (mod $\pi$):
+θ = (0:M-1)*2/M
 
 # On the mapped tensor product grid, our function samples are:
 F = [f(r*cospi(θ), r*sinpi(θ)) for r in r, θ in θ]
 
-P = plan_disk2cxf(F)
+# We superpose a surface plot of $f$ on top of the grid:
+X = [r*cospi(θ) for r in r, θ in θ]
+Y = [r*sinpi(θ) for r in r, θ in θ]
+scatter3d(vec(X), vec(Y), vec(0F); markersize=0.75, markercolor=:red)
+surface!(X, Y, F; legend=false, xlabel="x", ylabel="y", zlabel="f")
+savefig(joinpath(GENFIGS, "zernike.html"))
+###```@raw html
+###<object type="text/html" data="../zernike.html" style="width:100%;height:400px;"></object>
+###```
+
+# We precompute a (generalized) Zernike--Chebyshev×Fourier plan:
+α, β = 0, 0
+P = plan_disk2cxf(F, α, β)
+
+# And an FFTW Chebyshev×Fourier analysis plan on the disk:
 PA = plan_disk_analysis(F)
 
 # Its Zernike coefficients are:
 U = P\(PA*F)
 
-# The Zernike coefficients are useful for integration. The integral of f(x,y)
-# over the disk should be π/2 by harmonicity. The coefficient of Z_0^0
-# multiplied by √π is:
+# The Zernike coefficients are useful for integration. The integral of $f(x,y)$
+# over the disk should be $\pi/2$ by harmonicity. The coefficient of $Z_{0,0}$
+# multiplied by `√π` is:
+U[1, 1]*sqrt(π)
+
+# Using an orthonormal basis, the integral of $[f(x,y)]^2$ over the disk is
+# approximately the square of the 2-norm of the coefficients:
+norm(U)^2, π/(2*sqrt(2))*log1p(sqrt(2))
+
+# But there's more! Next, we repeat the experiment using the Dunkl-Xu
+# orthonormal polynomials supported on the rectangularized disk.
+N = 2N
+M = N
+
+# We analyze the function on an $N\times M$ mapped tensor product $xy$-grid defined by:
+# ```math
+# \begin{aligned}
+# x_n & = \cos\left(\frac{2n+1}{2N}\pi\right) = \sin\left(\frac{N-2n-1}{2N}\pi\right),\quad {\rm for} \quad 0 \le n < N,\quad{\rm and}\\
+# z_m & = \cos\left(\frac{2m+1}{2M}\pi\right) = \sin\left(\frac{M-2m-1}{2M}\pi\right),\quad {\rm for} \quad 0 \le m < M,\\
+# y_{n,m} & = \sqrt{1-x_n^2}z_m.
+# \end{aligned}
+# ```
+# Slightly more accuracy can be expected by using an auxiliary array:
+# ```math
+#   w_n = \sin\left(\frac{2n+1}{2N}\pi\right),\quad {\rm for} \quad 0 \le n < N,
+# ```
+# so that $y_{n,m} = w_nz_m$.
+#
+# The x grid
+w = [sinpi((n+0.5)/N) for n in 0:N-1]
+x = [sinpi((N-2n-1)/(2N)) for n in 0:N-1]
+
+# The z grid
+z = [sinpi((M-2m-1)/(2M)) for m in 0:M-1]
+
+# On the mapped tensor product grid, our function samples are:
+F = [f(x[n], w[n]*z) for n in 1:N, z in z]
+
+# We superpose a surface plot of $f$ on top of the grid:
+X = [x for x in x, z in z]
+Y = [w*z for w in w, z in z]
+scatter3d(vec(X), vec(Y), vec(0F); markersize=0.75, markercolor=:green)
+surface!(X, Y, F; legend=false, xlabel="x", ylabel="y", zlabel="f")
+savefig(joinpath(GENFIGS, "dunklxu.html"))
+###```@raw html
+###<object type="text/html" data="../dunklxu.html" style="width:100%;height:400px;"></object>
+###```
+
+# We precompute a Dunkl-Xu--Chebyshev plan:
+P = plan_rectdisk2cheb(F, β)
+
+# And an FFTW Chebyshev² analysis plan on the rectangularized disk:
+PA = plan_rectdisk_analysis(F)
+
+# Its Dunkl-Xu coefficients are:
+U = P\(PA*F)
+
+# The Dunkl-Xu coefficients are useful for integration. The integral of $f(x,y)$
+# over the disk should be $\pi/2$ by harmonicity. The coefficient of $P_{0,0}$
+# multiplied by `√π` is:
 U[1, 1]*sqrt(π)
 
-# Using an orthonormal basis, the integral of [f(x,y)]^2 over the disk is
+# Using an orthonormal basis, the integral of $[f(x,y)]^2$ over the disk is
 # approximately the square of the 2-norm of the coefficients:
-norm(U)^2
+norm(U)^2, π/(2*sqrt(2))*log1p(sqrt(2))
diff --git a/examples/halfrange.jl b/examples/halfrange.jl
new file mode 100644
index 00000000..e9de076a
--- /dev/null
+++ b/examples/halfrange.jl
@@ -0,0 +1,77 @@
+# # Half-range Chebyshev polynomials
+# In [this paper](https://doi.org/10.1137/090752456), [Daan Huybrechs](https://github.com/daanhb) introduced the so-called half-range Chebyshev polynomials
+# as the semi-classical orthogonal polynomials with respect to the inner product:
+# ```math
+# \langle f, g \rangle = \int_0^1 f(x) g(x)\frac{{\rm d} x}{\sqrt{1-x^2}}.
+# ```
+# By the variable transformation $y = 2x-1$, the resulting polynomials can be related to
+# orthogonal polynomials on $(-1,1)$ with the Jacobi weight $(1-y)^{-\frac{1}{2}}$ modified by the weight $(3+y)^{-\frac{1}{2}}$.
+#
+# We shall use the fact that:
+# ```math
+# \frac{1}{\sqrt{3+y}} = \sqrt{\frac{2}{3+\sqrt{8}}}\sum_{n=0}^\infty P_n(y) \left(\frac{-1}{3+\sqrt{8}}\right)^n,
+# ```
+# and results from [this paper](https://arxiv.org/abs/2302.08448) to consider the half-range Chebyshev polynomials as
+# modifications of the Jacobi polynomials $P_n^{(-\frac{1}{2},0)}(y)$.
+
+using FastTransforms, LinearAlgebra, Plots, LaTeXStrings
+const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
+mkpath(GENFIGS) # creates missing parent directories too; no-op if the directory already exists
+plotlyjs()
+
+# We truncate the generating function to ensure a relative error less than `eps()` in the uniform norm on $(-1,1)$:
+z = -1/(3+sqrt(8))
+K = sqrt(-2z)
+N = ceil(Int, log(abs(z), eps()/2*(1-abs(z))/K) - 1)
+d = K .* z .^(0:N)
+
+# Then, we convert this representation to the expansion in Jacobi polynomials $P_n^{(-\frac{1}{2}, 0)}(y)$:
+u = jac2jac(d, 0.0, 0.0, -0.5, 0.0; norm1 = false, norm2 = true)
+
+# Our working polynomial degree will be:
+n = 100
+
+# We compute the connection coefficients between the modified orthogonal polynomials and the Jacobi polynomials:
+P = plan_modifiedjac2jac(Float64, n+1, -0.5, 0.0, u)
+
+# We store the connection to first kind Chebyshev polynomials:
+P1 = plan_jac2cheb(Float64, n+1, -0.5, 0.0; normjac = true)
+
+# We compute the Chebyshev series for the degree-$k\le n$ modified polynomial and its values at the Chebyshev points:
+q = k -> lmul!(P1, lmul!(P, [zeros(k); 1.0; zeros(n-k)]))
+qvals = k-> ichebyshevtransform(q(k))
+
+# With the symmetric Jacobi matrix for $P_n^{(-\frac{1}{2}, 0)}(y)$ and the modified plan, we may compute the modified Jacobi matrix and the corresponding roots (as eigenvalues):
+XP = SymTridiagonal([-inv((4n-1)*(4n-5)) for n in 1:n+1], [4n*(2n-1)/(4n-1)/sqrt((4n-3)*(4n+1)) for n in 1:n])
+XQ = FastTransforms.modified_jacobi_matrix(P, XP)
+SymTridiagonal(XQ.dv[1:10], XQ.ev[1:9])
+
+# And we plot:
+x = (chebyshevpoints(Float64, n+1, Val(1)) .+ 1 ) ./ 2
+p = plot(x, qvals(0); linewidth=2.0, legend = false, xlim=(0,1), xlabel=L"x",
+         ylabel=L"T^h_n(x)", title="Half-Range Chebyshev Polynomials and Their Roots",
+         extra_plot_kwargs = KW(:include_mathjax => "cdn"))
+for k in 1:10
+    λ = (eigvals(SymTridiagonal(XQ.dv[1:k], XQ.ev[1:k-1])) .+ 1) ./ 2
+    plot!(x, qvals(k); linewidth=2.0, color=palette(:default)[k+1])
+    scatter!(λ, zero(λ); markersize=2.5, color=palette(:default)[k+1])
+end
+p
+savefig(joinpath(GENFIGS, "halfrange.html"))
+###```@raw html
+###<object type="text/html" data="../halfrange.html" style="width:100%;height:400px;"></object>
+###```
+
+# By [Theorem 2.20](https://arxiv.org/abs/2302.08448) it turns out that the *derivatives* of the half-range Chebyshev polynomials are a linear combination of at most two polynomials orthogonal with respect to $\sqrt{(3+y)(1-y)}(1+y)$ on $(-1,1)$. This fact enables us to compute the banded differentiation matrix:
+v̂ = 3*[u; 0]+XP[1:N+2, 1:N+1]*u
+v = jac2jac(v̂, -0.5, 0.0, 0.5, 1.0; norm1 = true, norm2 = true)
+function threshold!(A::AbstractArray, ϵ) # zero, in place, every entry with abs(A[i]) < ϵ
+    for idx in eachindex(A)
+        abs(A[idx]) < ϵ && (A[idx] = 0)
+    end
+    return A
+end
+P′ = plan_modifiedjac2jac(Float64, n+1, 0.5, 1.0, v)
+DP = UpperTriangular(diagm(1=>[sqrt(n*(n+1/2)) for n in 1:n])) # The classical differentiation matrix representing 𝒟 P^{(-1/2,0)}(y) = P^{(1/2,1)}(y) D_P.
+DQ = UpperTriangular(threshold!(P′\(DP*(P*I)), 100eps())) # The semi-classical differentiation matrix representing 𝒟 Q(y) = Q̂(y) D_Q.
+UpperTriangular(DQ[1:10,1:10])
diff --git a/examples/nonlocaldiffusion.jl b/examples/nonlocaldiffusion.jl
index 102e97bf..0399a110 100644
--- a/examples/nonlocaldiffusion.jl
+++ b/examples/nonlocaldiffusion.jl
@@ -1,3 +1,39 @@
+# # Nonlocal diffusion on $\mathbb{S}^2$
+# This example calculates the spectrum of the nonlocal diffusion operator:
+# ```math
+# \mathcal{L}_\delta u = \int_{\mathbb{S}^2} \rho_\delta(|\mathbf{x}-\mathbf{y}|)\left[u(\mathbf{x}) - u(\mathbf{y})\right] \,\mathrm{d}\Omega(\mathbf{y}),
+# ```
+# defined in Eq. (2) of
+#
+# R. M. Slevinsky, H. Montanelli, and Q. Du, [A spectral method for nonlocal diffusion operators on the sphere](https://doi.org/10.1016/j.jcp.2018.06.024), *J. Comp. Phys.*, **372**:893--911, 2018.
+#
+# In the above, $0<\delta<2$, $-1<\alpha<1$, and the kernel:
+# ```math
+# \rho_\delta(|\mathbf{x}-\mathbf{y}|) = \frac{4(1+\alpha)}{\pi \delta^{2+2\alpha}} \frac{\chi_{[0,\delta]}(|\mathbf{x}-\mathbf{y}|)}{|\mathbf{x}-\mathbf{y}|^{2-2\alpha}},
+# ```
+# where $\chi_I(\cdot)$ is the indicator function on the set $I$.
+#
+# This nonlocal operator is diagonalized by spherical harmonics:
+# ```math
+# \mathcal{L}_\delta Y_\ell^m(\mathbf{x}) = \lambda_\ell(\alpha, \delta) Y_\ell^m(\mathbf{x}),
+# ```
+# and its eigenvalues are given by the generalized Funk--Hecke formula:
+# ```math
+# \lambda_\ell(\alpha, \delta) = \frac{(1+\alpha) 2^{2+\alpha}}{\delta^{2+2\alpha}}\int_{1-\delta^2/2}^1 \left[P_\ell(t)-1\right] (1-t)^{\alpha-1} \,\mathrm{d} t.
+# ```
+# In the paper, the authors use Clenshaw--Curtis quadrature and asymptotic evaluation of Legendre polynomials to achieve $\mathcal{O}(n^2\log n)$ complexity for the evaluation of the first $n$ eigenvalues. With a change of basis, this complexity can be reduced to $\mathcal{O}(n\log n)$.
+#
+# First, we represent:
+# ```math
+# P_n(t) - 1 = \sum_{j=0}^{n-1} \left[P_{j+1}(t) - P_j(t)\right] = -\sum_{j=0}^{n-1} (1-t) P_j^{(1,0)}(t).
+# ```
+# Then, we represent $P_j^{(1,0)}(t)$ with Jacobi polynomials $P_i^{(\alpha,0)}(t)$ and we integrate using [DLMF 18.9.16](https://dlmf.nist.gov/18.9.16):
+# ```math
+# \int_x^1 P_i^{(\alpha,0)}(t)(1-t)^\alpha\,\mathrm{d}t = \left\{ \begin{array}{cc} \frac{(1-x)^{\alpha+1}}{\alpha+1} & \mathrm{for~}i=0,\\ \frac{1}{2i}(1-x)^{\alpha+1}(1+x)P_{i-1}^{(\alpha+1,1)}(x), & \mathrm{for~}i>0.\end{array}\right.
+# ```
+# The code below implements this algorithm, making use of the Jacobi--Jacobi transform `plan_jac2jac`.
+# For numerical stability, the conversion from Jacobi polynomials $P_j^{(1,0)}(t)$ to $P_i^{(\alpha,0)}(t)$ is divided into conversion from $P_j^{(1,0)}(t)$ to $P_k^{(0,0)}(t)$, before conversion from $P_k^{(0,0)}(t)$ to $P_i^{(\alpha,0)}(t)$.
+
 using FastTransforms, LinearAlgebra
 
 function oprec!(n::Integer, v::AbstractVector, alpha::Real, delta2::Real)
@@ -13,19 +49,6 @@ function oprec!(n::Integer, v::AbstractVector, alpha::Real, delta2::Real)
     return v
 end
 
-"""
-This example calculates the spectrum of the nonlocal diffusion operator:
-
-```math
-ℒ_δ u = ∫_𝕊² ρ_δ(|𝐱-𝐲|)[u(𝐱) - u(𝐲)] dΩ(𝐲),
-```
-
-defined in Eq. (2) of
-
-    R. M. Slevinsky, H. Montanelli, and Q. Du, A spectral method for nonlocal diffusion operators on the sphere, J. Comp. Phys., 372:893--911, 2018.
-
-available at https://doi.org/10.1016/j.jcp.2018.06.024
-"""
 function evaluate_lambda(n::Integer, alpha::T, delta::T) where T
     delta2 = delta*delta
     scl = (1+alpha)*(2-delta2/2)
@@ -47,7 +70,7 @@ function evaluate_lambda(n::Integer, alpha::T, delta::T) where T
 
     p = plan_jac2jac(T, n-1, zero(T), zero(T), alpha, zero(T))
 
-    lambda[2:end] .= p'lambda[2:end]
+    lmul!(p', view(lambda, 2:n))
 
     for i = 2:n-1
         lambda[i+1] = ((2i-1)*lambda[i+1] + (i-1)*lambda[i])/i
@@ -60,7 +83,11 @@ function evaluate_lambda(n::Integer, alpha::T, delta::T) where T
     return lambda
 end
 
-lambda = evaluate_lambda(1024, -0.5, 1.0)
-lambdabf = evaluate_lambda(1024, parse(BigFloat, "-0.5"), parse(BigFloat, "1.0"))
+# The spectrum in `Float64`:
+lambda = evaluate_lambda(10, -0.5, 1.0)
+
+# The spectrum in `BigFloat`:
+lambdabf = evaluate_lambda(10, parse(BigFloat, "-0.5"), parse(BigFloat, "1.0"))
 
+# The $\infty$-norm relative error:
 norm(lambda-lambdabf, Inf)/norm(lambda, Inf)
diff --git a/examples/padua.jl b/examples/padua.jl
index bdb9fdab..91655ce4 100644
--- a/examples/padua.jl
+++ b/examples/padua.jl
@@ -1,16 +1,20 @@
-#############
+# # Padua transform
 # This demonstrates the Padua transform and inverse transform,
 # explaining precisely the normalization and points
-#############
 
 using FastTransforms
 
+# We define the Padua points and extract Cartesian components:
 N = 15
 pts = paduapoints(N)
-x = pts[:,1]; y = pts[:,2]
+x = pts[:,1]
+y = pts[:,2];
 
+# We take the Padua transform of the function:
 f = (x,y) -> exp(x + cos(y))
-f̌ = paduatransform(f.(x , y))
+f̌ = paduatransform(f.(x , y));
+
+# and use the coefficients to create an approximation to the function $f$:
 f̃ = (x,y) -> begin
     j = 1
     ret = 0.0
@@ -21,6 +25,8 @@ f̃ = (x,y) -> begin
     ret
 end
 
+# At a particular point, is the function well-approximated?
 f̃(0.1,0.2) ≈ f(0.1,0.2)
 
+# Does the inverse transform bring us back to the grid?
 ipaduatransform(f̌) ≈ f̃.(x,y)
diff --git a/examples/sphere.jl b/examples/sphere.jl
index 3a1a0953..a28a36a2 100644
--- a/examples/sphere.jl
+++ b/examples/sphere.jl
@@ -1,28 +1,29 @@
-#############
+# # Spherical harmonic addition theorem
 # This example confirms numerically that
-#
-#   [P₄(z⋅y) - P₄(x⋅y)]/(z⋅y - x⋅y),
-#
-# is actually a degree-3 polynomial on 𝕊², where P₄ is the degree-4
-# Legendre polynomial, and x,y,z ∈ 𝕊².
-# To verify, we sample the function on a 5×9 equiangular grid
+# ```math
+# f(z) = \frac{P_n(z\cdot y) - P_n(x\cdot y)}{z\cdot y - x\cdot y},
+# ```
+# is actually a degree-$(n-1)$ polynomial on $\mathbb{S}^2$, where $P_n$ is the degree-$n$
+# Legendre polynomial, and $x,y,z \in \mathbb{S}^2$.
+# To verify, we sample the function on an $N\times M$ equiangular grid
 # defined by:
-#
-#   θₙ = (n+1/2)π/N, for 0 ≤ n < N, and
-#
-#   φₘ = 2π m/M, for 0 ≤ m < M;
-#
+# ```math
+# \begin{aligned}
+# \theta_n & = (n+\tfrac{1}{2})\pi/N,\quad{\rm for}\quad 0\le n < N,\quad{\rm and}\\
+# \varphi_m & = 2\pi m/M,\quad{\rm for}\quad 0\le m < M;
+# \end{aligned}
+# ```
 # we convert the function samples to Fourier coefficients using
 # `plan_sph_analysis`; and finally, we transform
 # the Fourier coefficients to spherical harmonic coefficients using
 # `plan_sph2fourier`.
 #
 # In the basis of spherical harmonics, it is plain to see the
-# addition theorem in action, since P₄(x⋅y) should only consist of
-# exact-degree-4 harmonics.
+# addition theorem in action, since $P_n(x\cdot y)$ should only consist of
+# exact-degree-$n$ harmonics.
 #
-# For the storage pattern of the arrays, please consult the documentation.
-#############
+# For the storage pattern of the arrays, please consult the
+# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).
 
 function threshold!(A::AbstractArray, ϵ)
     for i in eachindex(A)
@@ -31,65 +32,108 @@ function threshold!(A::AbstractArray, ϵ)
     A
 end
 
-using FastTransforms, LinearAlgebra
+using FastTransforms, LinearAlgebra, Plots
+const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
+!isdir(GENFIGS) && mkdir(GENFIGS)
+plotlyjs()
 
-# The colatitudinal grid (mod π):
-N = 5
+# The colatitudinal grid (mod $\pi$):
+N = 15
 θ = (0.5:N-0.5)/N
 
-# The longitudinal grid (mod π):
+# The longitudinal grid (mod $\pi$):
 M = 2*N-1
 φ = (0:M-1)*2/M
 
-# Arbitrarily, we place x at the North pole:
+# Arbitrarily, we place $x$ at the North pole:
 x = [0,0,1]
 
 # Another vector is completely free:
 y = normalize([.123,.456,.789])
 
-# Thus z ∈ 𝕊² is our variable vector, parameterized in spherical coordinates:
+# Thus $z \in \mathbb{S}^2$ is our variable vector, parameterized in spherical coordinates:
 z = (θ,φ) -> [sinpi(θ)*cospi(φ), sinpi(θ)*sinpi(φ), cospi(θ)]
 
-# The degree-4 Legendre polynomial is:
-P4 = x -> (35*x^4-30*x^2+3)/8
-
-# On the tensor product grid, our function samples are:
-F = [(P4(z(θ,φ)⋅y) - P4(x⋅y))/(z(θ,φ)⋅y - x⋅y) for θ in θ, φ in φ]
-
+# On the tensor product grid, the Legendre polynomial $P_n(z\cdot y)$ is:
+A = [(2k+1)/(k+1) for k in 0:N-1]
+B = zeros(N)
+C = [k/(k+1) for k in 0:N]
+c = zeros(N); c[N] = 1
+pts = vec([z(θ, φ)⋅y for θ in θ, φ in φ])
+phi0 = ones(N*M)
+F = reshape(FastTransforms.clenshaw!(zeros(N*M), c, A, B, C, pts, phi0), N, M)
+
+# We superpose a surface plot of $P_n(z\cdot y)$ on top of the grid:
+X = [sinpi(θ)*cospi(φ) for θ in θ, φ in φ]
+Y = [sinpi(θ)*sinpi(φ) for θ in θ, φ in φ]
+Z = [cospi(θ) for θ in θ, φ in φ]
+scatter3d(vec(X), vec(Y), vec(Z); markersize=1.25, markercolor=:violetred)
+surface!(X, Y, Z; surfacecolor=F, legend=false, xlabel="x", ylabel="y", zlabel="f")
+savefig(joinpath(GENFIGS, "sphere1.html"))
+###```@raw html
+###<object type="text/html" data="../sphere1.html" style="width:100%;height:400px;"></object>
+###```
+
+# We show the cut in the surface to help illustrate the definition of the grid.
+# In particular, we do not sample the poles.
+#
+# We precompute a spherical harmonic--Fourier plan:
 P = plan_sph2fourier(F)
+
+# And an FFTW Fourier analysis plan on $\mathbb{S}^2$:
 PA = plan_sph_analysis(F)
 
-# Its spherical harmonic coefficients demonstrate that it is degree-3:
+# Its spherical harmonic coefficients demonstrate that it is exact-degree-$n$:
 V = PA*F
-U3 = threshold!(P\V, 400*eps())
+U = threshold!(P\V, 400*eps())
+
+# The $L^2(\mathbb{S}^2)$ norm of the function is:
+nrm1 = norm(U)
 
-# Similarly, on the tensor product grid, the Legendre polynomial P₄(z⋅y) is:
-F = [P4(z(θ,φ)⋅y) for θ in θ, φ in φ]
+# Similarly, on the tensor product grid, our function samples are:
+Pnxy = FastTransforms.clenshaw!([0.0], c, A, B, C, [x⋅y], [1.0])[1]
+F = [(F[n, m] - Pnxy)/(z(θ[n], φ[m])⋅y - x⋅y) for n in 1:N, m in 1:M]
 
-# Its spherical harmonic coefficients demonstrate that it is exact-degree-4:
+# We superpose a surface plot of $f$ on top of the grid:
+scatter3d(vec(X), vec(Y), vec(Z); markersize=1.25, markercolor=:violetred)
+surface!(X, Y, Z; surfacecolor=F, legend=false, xlabel="x", ylabel="y", zlabel="f")
+savefig(joinpath(GENFIGS, "sphere2.html"))
+###```@raw html
+###<object type="text/html" data="../sphere2.html" style="width:100%;height:400px;"></object>
+###```
+
+# Its spherical harmonic coefficients demonstrate that it is degree-$(n-1)$:
 V = PA*F
-U4 = threshold!(P\V, 3*eps())
+U = threshold!(P\V, 400*eps())
 
-nrm1 = norm(U4);
+# Finally, the Legendre polynomial $P_n(z\cdot x)$ is aligned with the grid:
+pts = vec([z(θ, φ)⋅x for θ in θ, φ in φ])
+F = reshape(FastTransforms.clenshaw!(zeros(N*M), c, A, B, C, pts, phi0), N, M)
 
-# Finally, the Legendre polynomial P₄(z⋅x) is aligned with the grid:
-F = [P4(z(θ,φ)⋅x) for θ in θ, φ in φ]
+# We superpose a surface plot of $P_n(z\cdot x)$ on top of the grid:
+scatter3d(vec(X), vec(Y), vec(Z); markersize=1.25, markercolor=:violetred)
+surface!(X, Y, Z; surfacecolor=F, legend=false, xlabel="x", ylabel="y", zlabel="f")
+savefig(joinpath(GENFIGS, "sphere3.html"))
+###```@raw html
+###<object type="text/html" data="../sphere3.html" style="width:100%;height:400px;"></object>
+###```
 
 # It only has one nonnegligible spherical harmonic coefficient.
 # Can you spot it?
 V = PA*F
-U4 = threshold!(P\V, 3*eps())
+U = threshold!(P\V, 400*eps())
 
-# That nonnegligible coefficient should be approximately √(2π/(4+1/2)),
-# since the convention in this library is to orthonormalize.
+# That nonnegligible coefficient should be
+ret = eval("√(2π/($(N-1)+1/2))")
 
-nrm2 = norm(U4);
+# which is approximately
+eval(Meta.parse(ret))
 
-# Note that the integrals of both functions P₄(z⋅y) and P₄(z⋅x) and their
-# L²(𝕊²) norms are the same because of rotational invariance. The integral of
+# since the convention in this library is to orthonormalize.
+nrm2 = norm(U)
+
+# Note that the integrals of both functions $P_n(z\cdot y)$ and $P_n(z\cdot x)$ and their
+# $L^2(\mathbb{S}^2)$ norms are the same because of rotational invariance. The integral of
 # either is perhaps not interesting as it is mathematically zero, but the norms
 # of either should be approximately the same.
-
-@show nrm1
-@show nrm2
-@show nrm1 ≈ nrm2
+nrm1 ≈ nrm2
diff --git a/examples/sphericalisometries.jl b/examples/sphericalisometries.jl
new file mode 100644
index 00000000..b3ed7f6b
--- /dev/null
+++ b/examples/sphericalisometries.jl
@@ -0,0 +1,124 @@
+function threshold!(A::AbstractArray, ϵ)  # zero, in place, the entries of `A` with magnitude below `ϵ`
+    for idx in eachindex(A)
+        abs(A[idx]) < ϵ && (A[idx] = 0)
+    end
+    return A  # the same (mutated) array, so calls can be chained
+end
+
+using FastTransforms, LinearAlgebra, Random, Test
+
+# The colatitudinal grid (mod π):
+N = 10
+θ = (0.5:N-0.5)/N
+
+# The longitudinal grid (mod π):
+M = 2*N-1
+φ = (0:M-1)*2/M
+
+x = [cospi(φ)*sinpi(θ) for θ in θ, φ in φ]
+y = [sinpi(φ)*sinpi(θ) for θ in θ, φ in φ]
+z = [cospi(θ) for θ in θ, φ in φ]
+
+P = plan_sph2fourier(Float64, N)
+PA = plan_sph_analysis(Float64, N, M)
+J = FastTransforms.plan_sph_isometry(Float64, N)
+
+
+f = (x, y, z) -> x^2+y^4+x^2*y*z^3-x*y*z^2
+
+
+F = f.(x, y, z)
+V = PA*F
+U = threshold!(P\V, 100eps())
+FastTransforms.execute_sph_yz_axis_exchange!(J, U)
+FR = f.(x, -z, -y)
+VR = PA*FR
+UR = threshold!(P\VR, 100eps())
+@test U ≈ UR
+norm(U-UR)
+
+
+α, β, γ = 0.123, 0.456, 0.789
+
+# Isometry built up from ZYZR
+A = [cos(α) -sin(α) 0; sin(α) cos(α) 0; 0 0 1]
+B = [cos(β) 0 -sin(β); 0 1 0; sin(β) 0 cos(β)]
+C = [cos(γ) -sin(γ) 0; sin(γ) cos(γ) 0; 0 0 1]
+R = diagm([1, 1, 1.0])
+Q = A*B*C*R
+
+# Transform the sampling grid. Note that `Q` is transposed here.
+u = Q[1,1]*x + Q[2,1]*y + Q[3,1]*z
+v = Q[1,2]*x + Q[2,2]*y + Q[3,2]*z
+w = Q[1,3]*x + Q[2,3]*y + Q[3,3]*z
+
+F = f.(x, y, z)
+V = PA*F
+U = threshold!(P\V, 100eps())
+FastTransforms.execute_sph_rotation!(J, α, β, γ, U)
+FR = f.(u, v, w)
+VR = PA*FR
+UR = threshold!(P\VR, 100eps())
+@test U ≈ UR
+norm(U-UR)
+
+
+F = f.(x, y, z)
+V = PA*F
+U = threshold!(P\V, 100eps())
+FastTransforms.execute_sph_polar_reflection!(U)
+FR = f.(x, y, -z)
+VR = PA*FR
+UR = threshold!(P\VR, 100eps())
+@test U ≈ UR
+norm(U-UR)
+
+
+# Isometry built up from planar reflection
+W = [0.123, 0.456, 0.789]
+H = w -> I - 2/(w'w)*w*w'
+Q = H(W)
+
+# Transform the sampling grid. Note that `Q` is transposed here.
+u = Q[1,1]*x + Q[2,1]*y + Q[3,1]*z
+v = Q[1,2]*x + Q[2,2]*y + Q[3,2]*z
+w = Q[1,3]*x + Q[2,3]*y + Q[3,3]*z
+
+F = f.(x, y, z)
+V = PA*F
+U = threshold!(P\V, 100eps())
+FastTransforms.execute_sph_reflection!(J, W, U)
+FR = f.(u, v, w)
+VR = PA*FR
+UR = threshold!(P\VR, 100eps())
+@test U ≈ UR
+norm(U-UR)
+
+F = f.(x, y, z)
+V = PA*F
+U = threshold!(P\V, 100eps())
+FastTransforms.execute_sph_reflection!(J, (W[1], W[2], W[3]), U)
+FR = f.(u, v, w)
+VR = PA*FR
+UR = threshold!(P\VR, 100eps())
+@test U ≈ UR
+norm(U-UR)
+
+# Random orthogonal transformation
+Random.seed!(0)
+Q = qr(rand(3, 3)).Q
+
+# Transform the sampling grid. Note that `Q` is transposed here.
+u = Q[1,1]*x + Q[2,1]*y + Q[3,1]*z
+v = Q[1,2]*x + Q[2,2]*y + Q[3,2]*z
+w = Q[1,3]*x + Q[2,3]*y + Q[3,3]*z
+
+F = f.(x, y, z)
+V = PA*F
+U = threshold!(P\V, 100eps())
+FastTransforms.execute_sph_orthogonal_transformation!(J, Q, U)
+FR = f.(u, v, w)
+VR = PA*FR
+UR = threshold!(P\VR, 100eps())
+@test U ≈ UR
+norm(U-UR)
diff --git a/examples/spinweighted.jl b/examples/spinweighted.jl
new file mode 100644
index 00000000..6e8e8656
--- /dev/null
+++ b/examples/spinweighted.jl
@@ -0,0 +1,65 @@
+# # Spin-weighted spherical harmonics
+# This example plays with analysis of:
+# ```math
+# f(r) = e^{{\rm i} k\cdot r},
+# ```
+# for some $k\in\mathbb{R}^3$ and where $r\in\mathbb{S}^2$, using spin-$0$ spherical harmonics.
+#
+# It applies ð, the spin-raising operator,
+# both on the spin-$0$ coefficients as well as the original function,
+# followed by a spin-$1$ analysis to compare coefficients.
+#
+# For the storage pattern of the arrays, please consult the
+# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).
+
+using FastTransforms, LinearAlgebra
+
+# The colatitudinal grid (mod $\pi$):
+N = 10
+θ = (0.5:N-0.5)/N
+
+# The longitudinal grid (mod $\pi$):
+M = 2*N-1
+φ = (0:M-1)*2/M
+
+# Our choice of $k$ and angular parametrization of $r$:
+k = [2/7, 3/7, 6/7]
+r = (θ,φ) -> [sinpi(θ)*cospi(φ), sinpi(θ)*sinpi(φ), cospi(θ)]
+
+# On the tensor product grid, our function samples are:
+F = [exp(im*(k⋅r(θ,φ))) for θ in θ, φ in φ]
+
+# We precompute a spin-$0$ spherical harmonic--Fourier plan:
+P = plan_spinsph2fourier(F, 0)
+
+# And an FFTW Fourier analysis plan on $\mathbb{S}^2$:
+PA = plan_spinsph_analysis(F, 0)
+
+# Its spin-$0$ spherical harmonic coefficients are:
+U⁰ = P\(PA*F)
+
+# We can check its $L^2(\mathbb{S}^2)$ norm against an exact result:
+norm(U⁰) ≈ sqrt(4π)
+
+# Spin can be incremented by applying ð, either on the spin-$0$ coefficients:
+U¹c = zero(U⁰)
+for n in 1:N-1
+    U¹c[n, 1] = sqrt(n*(n+1))*U⁰[n+1, 1]
+end
+for m in 1:M÷2
+    for n in 0:N-1
+        U¹c[n+1, 2m] = -sqrt((n+m)*(n+m+1))*U⁰[n+1, 2m]
+        U¹c[n+1, 2m+1] = sqrt((n+m)*(n+m+1))*U⁰[n+1, 2m+1]
+    end
+end
+
+# or on the original function through analysis with spin-$1$ spherical harmonics:
+F = [-(k[1]*(im*cospi(θ)*cospi(φ) + sinpi(φ)) + k[2]*(im*cospi(θ)*sinpi(φ)-cospi(φ)) - im*k[3]*sinpi(θ))*exp(im*(k⋅r(θ,φ))) for θ in θ, φ in φ]
+
+# We change plans with spin-$1$ now and reanalyze:
+P = plan_spinsph2fourier(F, 1)
+PA = plan_spinsph_analysis(F, 1)
+U¹s = P\(PA*F)
+
+# Finally, we check $L^2(\mathbb{S}^2)$ norms against another exact result:
+norm(U¹c) ≈ norm(U¹s) ≈ sqrt(8π/3*(k⋅k))
diff --git a/examples/subspaceangles.jl b/examples/subspaceangles.jl
new file mode 100644
index 00000000..b6bf5ef0
--- /dev/null
+++ b/examples/subspaceangles.jl
@@ -0,0 +1,29 @@
+# # Subspace angles
+# This example considers the angles between neighbouring Laguerre polynomials with a perturbed measure:
+# ```math
+# \cos\theta_n = \frac{\langle L_n, L_{n+k}\rangle}{\|L_n\|_2 \|L_{n+k}\|_2},\quad{\rm for}\quad 0\le n < N-k,
+# ```
+# where the inner product is defined by $\langle f, g\rangle = \int_0^\infty f(x) g(x) x^\beta e^{-x}{\rm\,d}x$.
+#
+# We do so by connecting Laguerre polynomials to the normalized generalized Laguerre polynomials associated with the perturbed measure. It follows by the inner product of the connection coefficients that:
+# ```math
+# \cos\theta_n = \frac{(V^\top V)_{n, n+k}}{\sqrt{(V^\top V)_{n, n}(V^\top V)_{n+k, n+k}}}.
+# ```
+#
+using FastTransforms, LinearAlgebra
+
+# The neighbouring index `k` and the maximum degree `N-1`:
+k, N = 1, 11
+
+# The Laguerre connection parameters:
+α, β = 0.0, 0.125
+
+# We precompute a Laguerre--Laguerre plan:
+P = plan_lag2lag(Float64, N, α, β; norm2=true)
+
+# We apply the plan to the identity, followed by the adjoint plan:
+VtV = parent(P*I)
+lmul!(P', VtV)
+
+# From this matrix, the angles are recovered from:
+θ = [acos(VtV[n, n+k]/sqrt(VtV[n, n]*VtV[n+k, n+k])) for n in 1:N-k]
diff --git a/examples/triangle.jl b/examples/triangle.jl
index e0879fc3..29ebb713 100644
--- a/examples/triangle.jl
+++ b/examples/triangle.jl
@@ -1,67 +1,92 @@
-#############
+# # Calculus on the reference triangle
 # In this example, we sample a bivariate function:
-#
-#   f(x,y) = 1/(1+x^2+y^2),
-#
-# on the reference triangle with vertices (0,0), (0,1), and (1,0) and analyze it
+# ```math
+# f(x,y) = \frac{1}{1+x^2+y^2},
+# ```
+# on the reference triangle with vertices $(0,0)$, $(0,1)$, and $(1,0)$ and analyze it
 # in a Proriol series. Then, we find Proriol series for each component of its
 # gradient by term-by-term differentiation of our expansion, and we compare them
 # with the true Proriol series by sampling an exact expression for the gradient.
 #
-# We analyze f(x,y) on an N×M mapped tensor product grid defined by:
-#
-#   x = (1+u)/2, and y = (1-u)*(1+v)/4, where:
-#
-#   uₙ = cos[(n+1/2)π/N], for 0 ≤ n < N, and
-#
-#   vₘ = cos[(m+1/2)π/M], for 0 ≤ m < M;
-#
+# We analyze $f(x,y)$ on an $N\times M$ mapped tensor product grid defined by:
+# ```math
+# \begin{aligned}
+# x & = (1+u)/2,\quad{\rm and}\quad y = (1-u)(1+v)/4,\quad {\rm where:}\\
+# u_n & = \cos\left[(n+\tfrac{1}{2})\pi/N\right],\quad{\rm for}\quad 0\le n < N,\quad{\rm and}\\
+# v_m & = \cos\left[(m+\tfrac{1}{2})\pi/M\right],\quad{\rm for}\quad 0\le m < M;
+# \end{aligned}
+# ```
 # we convert the function samples to mapped Chebyshev² coefficients using
 # `plan_tri_analysis`; and finally, we transform the mapped Chebyshev²
 # coefficients to Proriol coefficients using `plan_tri2cheb`.
 #
-# For the storage pattern of the arrays, please consult the documentation.
-#############
+# For the storage pattern of the arrays, please consult the
+# [documentation](https://MikaelSlevinsky.github.io/FastTransforms).
 
-using FastTransforms, LinearAlgebra
+using FastTransforms, LinearAlgebra, Plots
+const GENFIGS = joinpath(pkgdir(FastTransforms), "docs/src/generated")
+!isdir(GENFIGS) && mkdir(GENFIGS)
+plotlyjs()
 
+# Our function $f$ and the Cartesian components of its gradient:
 f = (x,y) -> 1/(1+x^2+y^2)
 fx = (x,y) -> -2x/(1+x^2+y^2)^2
 fy = (x,y) -> -2y/(1+x^2+y^2)^2
 
-N = 10
+# The polynomial degree:
+N = 15
 M = N
 
+# The parameters of the Proriol series:
 α, β, γ = 0, 0, 0
 
+# The $u$ grid:
 u = [sinpi((N-2n-1)/(2N)) for n in 0:N-1]
+
+# And the $v$ grid:
 v = [sinpi((M-2m-1)/(2M)) for m in 0:M-1]
 
-# Instead of using the u, v grid, we use one with more accuracy near the origin.
+# Instead of using the $u\times v$ grid, we use one with more accuracy near the origin.
+# Defining $x$ by:
 x = [sinpi((2N-2n-1)/(4N))^2 for n in 0:N-1]
+
+# And $w$ by:
 w = [sinpi((2M-2m-1)/(4M))^2 for m in 0:M-1]
 
-(1 .+ u)./2 ≈ x
-(1 .- u).*(1 .+ v')/4 ≈ reverse(x).*w'
+# We see how the two grids are related:
+((1 .+ u)./2 ≈ x) * ((1 .- u).*(1 .+ v')/4 ≈ reverse(x).*w')
 
 # On the mapped tensor product grid, our function samples are:
 F = [f(x[n+1], x[N-n]*w[m+1]) for n in 0:N-1, m in 0:M-1]
 
+# We superpose a surface plot of $f$ on top of the grid:
+X = [x for x in x, w in w]
+Y = [x[N-n]*w[m+1] for n in 0:N-1, m in 0:M-1]
+scatter3d(vec(X), vec(Y), vec(0F); markersize=0.75, markercolor=:blue)
+surface!(X, Y, F; legend=false, xlabel="x", ylabel="y", zlabel="f")
+savefig(joinpath(GENFIGS, "proriol.html"))
+###```@raw html
+###<object type="text/html" data="../proriol.html" style="width:100%;height:400px;"></object>
+###```
+
+# We precompute a Proriol--Chebyshev² plan:
 P = plan_tri2cheb(F, α, β, γ)
+
+# And an FFTW Chebyshev² plan on the triangle:
 PA = plan_tri_analysis(F)
 
-# Its Proriol-(α,β,γ) coefficients are:
+# Its Proriol-$(α,β,γ)$ coefficients are:
 U = P\(PA*F)
 
 # Similarly, our function's gradient samples are:
 Fx = [fx(x[n+1], x[N-n]*w[m+1]) for n in 0:N-1, m in 0:M-1]
+
+# and:
 Fy = [fy(x[n+1], x[N-n]*w[m+1]) for n in 0:N-1, m in 0:M-1]
 
-# For the partial derivative with respect to x, Olver et al.
+# For the partial derivative with respect to $x$, [Olver et al.](https://doi.org/10.1137/19M1245888)
 # derive simple expressions for the representation of this component
-# using a Proriol-(α+1,β,γ+1) series. For the partial derivative with respect
-# to y, the analogous formulae result in a Proriol-(α,β+1,γ+1) series.
-# These expressions are adapted from https://arxiv.org/abs/1902.04863.
+# using a Proriol-$(α+1,β,γ+1)$ series.
 Gx = zeros(Float64, N, M)
 for m = 0:M-2
     for n = 0:N-2
@@ -73,6 +98,8 @@ end
 Px = plan_tri2cheb(Fx, α+1, β, γ+1)
 Ux = Px\(PA*Fx)
 
+# For the partial derivative with respect to $y$, the analogous formulae result
+# in a Proriol-$(α,β+1,γ+1)$ series.
 Gy = zeros(Float64, N, M)
 for m = 0:M-2
     for n = 0:N-2
@@ -82,8 +109,8 @@ end
 Py = plan_tri2cheb(Fy, α, β+1, γ+1)
 Uy = Py\(PA*Fy)
 
-# The 2-norm relative error in differentiating the Proriol series
-# for f(x,y) term-by-term and its sampled gradient is:
+# The $2$-norm relative error in differentiating the Proriol series
+# for $f(x,y)$ term-by-term and its sampled gradient is:
 hypot(norm(Ux-Gx), norm(Uy-Gy))/hypot(norm(Ux), norm(Uy))
 
-# This error can be improved upon by increasing N and M.
+# This error can be improved upon by increasing $N$ and $M$.
diff --git a/src/FastTransforms.jl b/src/FastTransforms.jl
index 56027628..a9324735 100644
--- a/src/FastTransforms.jl
+++ b/src/FastTransforms.jl
@@ -1,93 +1,154 @@
 module FastTransforms
 
-using DSP, FastGaussQuadrature, Libdl, LinearAlgebra, SpecialFunctions, ToeplitzMatrices
-using Reexport
-@reexport using AbstractFFTs
-@reexport using FFTW
+using ArrayLayouts, BandedMatrices, FastGaussQuadrature, FillArrays, LazyArrays, LinearAlgebra,
+      SpecialFunctions, ToeplitzMatrices, RecurrenceRelationships
 
-import Base: unsafe_convert, eltype, ndims, adjoint, transpose, show, *, \,
-             inv, size, view
+using AbstractFFTs
+using FFTW
+using GenericFFT
+
+import Base: convert, unsafe_convert, eltype, ndims, adjoint, transpose, show,
+             *, \, inv, length, size, view, getindex, tail, OneTo
 
 import Base.GMP: Limb
-import Base.MPFR: BigFloat, _BigFloat
 
 import AbstractFFTs: Plan, ScaledPlan,
-                     fft, ifft, bfft, fft!, ifft!, bfft!,
-                     plan_fft, plan_ifft, plan_bfft, plan_fft!, plan_ifft!, plan_bfft!,
-                     rfft, irfft, brfft, plan_rfft, plan_irfft, plan_brfft,
-                     fftshift, ifftshift,
-                     rfft_output_size, brfft_output_size,
-                     plan_inv, normalization
+                     fft, ifft, bfft, fft!, ifft!, bfft!, rfft, irfft, brfft,
+                     plan_fft, plan_ifft, plan_bfft, plan_fft!, plan_ifft!,
+                     plan_bfft!, plan_rfft, plan_irfft, plan_brfft,
+                     fftshift, ifftshift, rfft_output_size, brfft_output_size,
+                     normalization
+
+import ArrayLayouts: rowsupport, colsupport, LayoutMatrix, MemoryLayout, AbstractBandedLayout
+
+import BandedMatrices: bandwidths, BandedLayout
 
 import FFTW: dct, dct!, idct, idct!, plan_dct!, plan_idct!,
              plan_dct, plan_idct, fftwNumber
 
-import DSP: conv
-
 import FastGaussQuadrature: unweightedgausshermite
 
-import LinearAlgebra: mul!, lmul!, ldiv!
+import FillArrays: AbstractFill, getindex_value
+
+import LinearAlgebra: cholesky, issymmetric, isposdef, mul!, lmul!, ldiv!
+
+import GenericFFT: interlace # imported in downstream packages
+
+import RecurrenceRelationships: check_clenshaw_recurrences
+
 
 export leg2cheb, cheb2leg, ultra2ultra, jac2jac,
        lag2lag, jac2ultra, ultra2jac, jac2cheb,
-       cheb2jac, ultra2cheb, cheb2ultra,
-       sph2fourier, sphv2fourier, disk2cxf, tri2cheb, tet2cheb,
-       fourier2sph, fourier2sphv, cxf2disk, cheb2tri, cheb2tet
+       cheb2jac, ultra2cheb, cheb2ultra, associatedjac2jac,
+       modifiedjac2jac, modifiedlag2lag, modifiedherm2herm,
+       sph2fourier, sphv2fourier, disk2cxf, ann2cxf, rectdisk2cheb,
+       tri2cheb, tet2cheb,fourier2sph, fourier2sphv, cxf2disk, cxf2ann,
+       cheb2rectdisk, cheb2tri, cheb2tet
 
 export plan_leg2cheb, plan_cheb2leg, plan_ultra2ultra, plan_jac2jac,
        plan_lag2lag, plan_jac2ultra, plan_ultra2jac, plan_jac2cheb,
-       plan_cheb2jac, plan_ultra2cheb, plan_cheb2ultra,
+       plan_cheb2jac, plan_ultra2cheb, plan_cheb2ultra, plan_associatedjac2jac,
+       plan_modifiedjac2jac, plan_modifiedlag2lag, plan_modifiedherm2herm,
        plan_sph2fourier, plan_sph_synthesis, plan_sph_analysis,
        plan_sphv2fourier, plan_sphv_synthesis, plan_sphv_analysis,
        plan_disk2cxf, plan_disk_synthesis, plan_disk_analysis,
+       plan_ann2cxf, plan_annulus_synthesis, plan_annulus_analysis,
+       plan_rectdisk2cheb, plan_rectdisk_synthesis, plan_rectdisk_analysis,
        plan_tri2cheb, plan_tri_synthesis, plan_tri_analysis,
-       plan_tet2cheb, plan_tet_synthesis, plan_tet_analysis
+       plan_tet2cheb, plan_tet_synthesis, plan_tet_analysis,
+       plan_spinsph2fourier, plan_spinsph_synthesis, plan_spinsph_analysis
+
 
 include("libfasttransforms.jl")
+include("elliptic.jl")
 
-export plan_nufft, plan_nufft1, plan_nufft2, plan_nufft3, plan_inufft1, plan_inufft2
 export nufft, nufft1, nufft2, nufft3, inufft1, inufft2
 
+export plan_nufft, plan_nufft1, plan_nufft2, plan_nufft3,
+       plan_inufft1, plan_inufft2
+
 include("nufft.jl")
 include("inufft.jl")
 
 export paduatransform, ipaduatransform, paduatransform!, ipaduatransform!,
-       paduapoints, plan_paduatransform!, plan_ipaduatransform!
+       paduapoints
+
+export plan_paduatransform!, plan_ipaduatransform!
 
 include("PaduaTransform.jl")
 
-export plan_chebyshevtransform, plan_ichebyshevtransform, plan_chebyshevtransform!, plan_ichebyshevtransform!,
-            chebyshevtransform, ichebyshevtransform, chebyshevpoints,
-            plan_chebyshevutransform, plan_ichebyshevutransform, plan_chebyshevutransform!, plan_ichebyshevutransform!,
-            chebyshevutransform, ichebyshevutransform
+export chebyshevtransform, ichebyshevtransform,
+       chebyshevtransform!, ichebyshevtransform!,
+       chebyshevutransform, ichebyshevutransform,
+       chebyshevutransform!, ichebyshevutransform!, chebyshevpoints
+
+export plan_chebyshevtransform, plan_ichebyshevtransform,
+       plan_chebyshevtransform!, plan_ichebyshevtransform!,
+       plan_chebyshevutransform, plan_ichebyshevutransform,
+       plan_chebyshevutransform!, plan_ichebyshevutransform!
 
 include("chebyshevtransform.jl")
 
-export plan_clenshawcurtis, clenshawcurtisnodes, clenshawcurtisweights
-export plan_fejer1, fejernodes1, fejerweights1,
-       plan_fejer2, fejernodes2, fejerweights2
+export clenshawcurtisnodes, clenshawcurtisweights, fejernodes1, fejerweights1,
+       fejernodes2, fejerweights2
+
+export plan_clenshawcurtis, plan_fejer1, plan_fejer2
 
 include("clenshawcurtis.jl")
 include("fejer.jl")
 
-export weightedhermitetransform, iweightedhermitetransform
+export gaunt
 
-include("hermite.jl")
+include("gaunt.jl")
 
-include("fftBigFloat.jl")
+export GramMatrix, ChebyshevGramMatrix
 
-export gaunt
+include("GramMatrix.jl")
 
-include("gaunt.jl")
+export weightedhermitetransform, iweightedhermitetransform
+
+include("hermite.jl")
 
 export sphones, sphzeros, sphrand, sphrandn, sphevaluate,
        sphvones, sphvzeros, sphvrand, sphvrandn,
        diskones, diskzeros, diskrand, diskrandn,
+       rectdiskones, rectdiskzeros, rectdiskrand, rectdiskrandn,
        triones, trizeros, trirand, trirandn, trievaluate,
-       tetones, tetzeros, tetrand, tetrandn
-
-lgamma(x) = logabsgamma(x)[1]
+       tetones, tetzeros, tetrand, tetrandn,
+       spinsphones, spinsphzeros, spinsphrand, spinsphrandn
 
 include("specialfunctions.jl")
 
+include("toeplitzplans.jl")
+include("toeplitzhankel.jl")
+
+export ToeplitzPlusHankel
+
+include("ToeplitzPlusHankel.jl")
+
+# following use libfasttransforms by default
+for f in (:jac2jac,
+    :lag2lag, :jac2ultra, :ultra2jac, :jac2cheb,
+    :cheb2jac, :ultra2cheb, :cheb2ultra, :associatedjac2jac,
+    :modifiedjac2jac, :modifiedlag2lag, :modifiedherm2herm,
+    :sph2fourier, :sphv2fourier, :disk2cxf, :ann2cxf,
+    :rectdisk2cheb, :tri2cheb, :tet2cheb,
+    :leg2cheb, :cheb2leg, :ultra2ultra)
+    lib_f = Symbol("lib_", f)
+    @eval $f(x::AbstractArray, y...; z...) = $lib_f(x, y...; z...)
+end
+
+include("arrays.jl")
+# following use Toeplitz-Hankel to avoid expensive plans
+# for f in (:leg2cheb, :cheb2leg, :ultra2ultra)
+#     th_f = Symbol("th_", f)
+#     lib_f = Symbol("lib_", f)
+#     @eval begin
+#         $f(x::AbstractArray, y...; z...) = $th_f(x, y...; z...)
+#         # $f(x::AbstractArray, y...; z...) = $lib_f(x, y...; z...)
+#     end
+# end
+
+include("docstrings.jl")
+
 end # module
diff --git a/src/GramMatrix.jl b/src/GramMatrix.jl
new file mode 100644
index 00000000..75f0cc22
--- /dev/null
+++ b/src/GramMatrix.jl
@@ -0,0 +1,411 @@
+abstract type AbstractGramMatrix{T} <: LayoutMatrix{T} end
+
+@inline issymmetric(G::AbstractGramMatrix) = true
+@inline isposdef(G::AbstractGramMatrix) = true
+
+struct GramMatrix{T, WT <: AbstractMatrix{T}, XT <: AbstractMatrix{T}} <: AbstractGramMatrix{T}
+    W::WT  # Gram matrix data; must be symmetric (checked by the inner constructor)
+    X::XT  # matrix with bandwidths (1, 1) and the same size as W (checked by the inner constructor)
+    function GramMatrix{T, WT, XT}(W::WT, X::XT) where {T, WT, XT}  # validating inner constructor
+        if size(W) ≠ size(X)  # W and X must have identical dimensions
+            throw(ArgumentError("Cannot construct a GramMatrix with W and X of different sizes."))
+        end
+        if !issymmetric(W)  # reject nonsymmetric data
+            throw(ArgumentError("Cannot construct a GramMatrix with a nonsymmetric W."))
+        end
+        if bandwidths(X) ≠ (1, 1)  # X must have exactly one sub- and one superdiagonal
+            throw(ArgumentError("Cannot construct a GramMatrix with a nontridiagonal X."))
+        end
+        new{T, WT, XT}(W, X)  # all checks passed: store both matrices as given
+    end
+end
+
+"""
+    GramMatrix(W::AbstractMatrix, X::AbstractMatrix)
+
+Construct a symmetric positive-definite Gram matrix with data stored in ``W``.
+Given a family of orthogonal polynomials ``𝐏(x) = {p₀(x), p₁(x),…}``
+and a continuous inner product ``⟨f, g⟩``, the Gram matrix is defined by:
+```math
+W[i, j] = ⟨p_{i-1}, p_{j-1}⟩.
+```
+Moreover, given ``X``, the transposed Jacobi matrix that satisfies ``x 𝐏(x) = 𝐏(x) X``,
+the Gram matrix satisfies the skew-symmetric rank-2 displacement equation (``X = X[1:n, 1:n]``):
+```math
+XᵀW - WX = GJGᵀ,
+```
+where ``J = [0 1; -1 0]`` and where:
+```math
+G[:, 1] = 𝐞_n, \\quad  G[:, 2] = W[:, n-1]X[n-1, n] + W[:, n]X[n, n] - Xᵀ W[:, n].
+```
+Fast (``O(n^2)``) Cholesky factorization of the Gram matrix returns the
+connection coefficients between ``𝐏(x)`` and the polynomials ``𝐐(x)``
+orthogonal in the modified inner product, ``𝐏(x) = 𝐐(x) R``.
+
+See also [`ChebyshevGramMatrix`](@ref) for a special case.
+
+> K. Gumerov, S. Rigg, and R. M. Slevinsky, [Fast measure modification of orthogonal polynomials via matrices with displacement structure](https://arxiv.org/abs/2412.17663), arXiv:2412.17663, 2024.
+"""
+GramMatrix(W::WT, X::XT) where {T, WT <: AbstractMatrix{T}, XT <: AbstractMatrix{T}} = GramMatrix{T, WT, XT}(W, X)
+
+@inline size(G::GramMatrix) = size(G.W)  # size, indexing, bandwidths and layout are all forwarded to the stored matrix W
+@inline getindex(G::GramMatrix, i::Integer, j::Integer) = G.W[i, j]
+@inline bandwidths(G::GramMatrix) = bandwidths(G.W)
+@inline MemoryLayout(G::GramMatrix) = MemoryLayout(G.W)  # a GramMatrix reports the layout of W
+@inline rowsupport(G::GramMatrix, j) = rowsupport(MemoryLayout(G), G.W, j)  # support queries are answered by W under that layout
+@inline colsupport(G::GramMatrix, j) = colsupport(MemoryLayout(G), G.W, j)
+
+"""
+    GramMatrix(μ::AbstractVector, X::AbstractMatrix)
+
+Construct a GramMatrix from modified orthogonal polynomial moments and the multiplication operator.
+In the standard (classical) normalization, ``p₀(x) = 1``, so that the moments
+``µ[n] = ⟨ pₙ₋₁, 1⟩`` are in fact the first column of the Gram matrix.
+The recurrence is built from ``XᵀW = WX``.
+"""
+GramMatrix(μ::AbstractVector{T}, X::XT) where {T, XT <: AbstractMatrix{T}} = GramMatrix(μ, X, one(T))
+function GramMatrix(μ::AbstractVector{T}, X::XT, p0::T) where {T, XT <: AbstractMatrix{T}}  # `p0` scales the moments `μ` into column 1 of W
+    N = length(μ)
+    n = (N+1)÷2  # N moments determine an n×n leading block of the Gram matrix
+    @assert N == size(X, 1) == size(X, 2)
+    @assert bandwidths(X) == (1, 1)
+    W = LowerTriangular(Matrix{T}(undef, N, N))  # only the lower triangle is ever written or read
+    if n > 0
+        @inbounds for m in 1:N
+            W[m, 1] = p0*μ[m]  # first column: scaled moments
+        end
+    end
+    if n > 1
+        @inbounds for m in 2:N-1  # column 2 follows from column 1 alone (there is no column 0 term)
+            W[m, 2] = (X[m-1, m]*W[m-1, 1] + (X[m, m]-X[1, 1])*W[m, 1] + X[m+1, m]*W[m+1, 1])/X[2, 1]
+        end
+    end
+    @inbounds @simd for n in 3:n  # the loop variable shadows the outer `n`, which keeps its value after the loop
+        for m in n:N-n+1  # each further column is one row shorter at the bottom
+            W[m, n] = (X[m-1, m]*W[m-1, n-1] + (X[m, m]-X[n-1, n-1])*W[m, n-1] + X[m+1, m]*W[m+1, n-1] - X[n-2, n-1]*W[m, n-2])/X[n, n-1]
+        end
+    end
+    return GramMatrix(Symmetric(W[1:n, 1:n], :L), Base.typename(XT).wrapper(view(X, 1:n, 1:n)))  # rebuild X's leading n×n block with X's own matrix type; taken from the type itself rather than via a runtime `eval` of its name
+end
+
+function GramMatrix(μ::PaddedVector{T}, X::XT, p0::T) where {T, XT <: AbstractMatrix{T}}  # banded variant of the moment-based constructor
+    N = length(μ)
+    b = length(μ.args[2])-1  # lower bandwidth of W; presumably `μ.args[2]` is the stored (nonzero) part of μ — TODO confirm
+    n = (N+1)÷2  # N moments determine an n×n leading block of the Gram matrix
+    @assert N == size(X, 1) == size(X, 2)
+    @assert bandwidths(X) == (1, 1)
+    W = BandedMatrix{T}(undef, (N, N), (b, 0))  # lower-banded storage: b subdiagonals, no superdiagonals
+    if n > 0
+        @inbounds for m in 1:min(N, b+1)
+            W[m, 1] = p0*μ[m]  # first column: scaled moments, restricted to the band
+        end
+    end
+    if n > 1
+        @inbounds for m in 2:min(N-1, b+2)  # column 2 follows from column 1 alone, clipped to the band
+            W[m, 2] = (X[m-1, m]*W[m-1, 1] + (X[m, m]-X[1, 1])*W[m, 1] + X[m+1, m]*W[m+1, 1])/X[2, 1]
+        end
+    end
+    @inbounds @simd for n in 3:n  # the loop variable shadows the outer `n`, which keeps its value after the loop
+        for m in n:min(N-n+1, b+n)  # rows limited both by the shrinking column length and by the band
+            W[m, n] = (X[m-1, m]*W[m-1, n-1] + (X[m, m]-X[n-1, n-1])*W[m, n-1] + X[m+1, m]*W[m+1, n-1] - X[n-2, n-1]*W[m, n-2])/X[n, n-1]
+        end
+    end
+    return GramMatrix(Symmetric(W[1:n, 1:n], :L), Base.typename(XT).wrapper(view(X, 1:n, 1:n)))  # rebuild X's leading n×n block with X's own matrix type; taken from the type itself rather than via a runtime `eval` of its name
+end
+
+"""
+    GramMatrix(cnm1::AbstractVector, cn::AbstractVector, X::AbstractMatrix)
+
+Construct a GramMatrix from its last two columns and the multiplication operator.
+The recurrence is built from ``XᵀW = WX`` and is used in case the moment method is unstable (such as with Laguerre).
+"""
+function GramMatrix(cnm1::AbstractVector{T}, cn::AbstractVector{T}, X::XT) where {T, XT <: AbstractMatrix{T}}  # `cnm1`, `cn`: columns N-1 and N of the Gram matrix
+    N = length(cn)
+    @assert N == length(cnm1) == size(X, 1) == size(X, 2)
+    @assert bandwidths(X) == (1, 1)
+    W = Matrix{T}(undef, N, N)  # dense storage; every entry is assigned below before being read
+    if N > 0
+        @inbounds for m in 1:N
+            W[N, m] = W[m, N] = cn[m]  # last row and last column
+        end
+    end
+    if N > 1
+        @inbounds for m in 1:N
+            W[N-1, m] = W[m, N-1] = cnm1[m]  # second-to-last row and column
+        end
+    end
+    @inbounds @simd for n in N:-1:3  # sweep right to left: column n-2 is obtained from columns n-1 and n
+        W[1, n-2]  = ((X[1, 1]-X[n-1, n-1])*W[1, n-1] + X[2, 1]*W[2, n-1] - X[n, n-1]*W[1, n])/X[n-2, n-1]  # first row: no m-1 neighbour
+        for m in 2:n-2
+            W[m, n-2]  = (X[m-1, m]*W[m-1, n-1] + (X[m, m]-X[n-1, n-1])*W[m, n-1] + X[m+1, m]*W[m+1, n-1] - X[n, n-1]*W[m, n])/X[n-2, n-1]
+        end
+        for m in n-1:N-2
+            W[m, n-2] = W[n-2, m]  # rest of the column is copied from row n-2 (symmetry)
+        end
+    end
+    return GramMatrix(W, X)  # the validating constructor checks symmetry of W and the bandwidths of X
+end
+
+#
+# X'W-W*X = G*J*G'
+# This returns G, where J = [0 1; -1 0], respecting the skew-symmetry of the right-hand side.
+#
+function compute_skew_generators(W::GramMatrix{T}) where T  # returns a freshly allocated n×2 matrix G
+    X = W.X
+    n = size(W, 1)
+    G = zeros(T, n, 2)
+    G[n, 1] = one(T)  # first generator: the n-th unit vector
+    G[:, 2] .= W[:, n-1]*X[n-1, n] + W[:, n]*X[n, n] - X'W[:, n]  # second generator, built from the last two columns of W; indexes column n-1, so n ≥ 2 is required
+    return G
+end
+
+function cholesky(W::GramMatrix{T}) where T  # entry point: picks the dense or banded method below
+    cholesky(MemoryLayout(W), W)  # dispatch on the memory layout reported by W
+end
+
+function cholesky(_, W::GramMatrix{T}) where T  # fallback for any layout without a more specific method
+    n = size(W, 1)
+    G = compute_skew_generators(W)  # n×2 generators; updated in place by fastcholesky!
+    L = zeros(T, n, n)  # dense factor; fastcholesky! writes only entries on and below the diagonal
+    c = W[:, 1]  # working column, initialized with a copy of the first column of W
+    ĉ = zeros(T, n)  # scratch vectors reused across all elimination steps
+    l = zeros(T, n)
+    v = zeros(T, n)
+    row1 = zeros(T, n)
+    fastcholesky!(L, W.X, G, c, ĉ, l, v, row1, n)
+    return Cholesky(L, 'L', 0)  # wrap as a lower ('L') factorization with info code 0
+end
+
+function fastcholesky!(L::Matrix{T}, X, G, c, ĉ, l, v, row1, n) where T  # fills L in place; G, c, ĉ, l, v and row1 are all overwritten
+    @inbounds @simd for k in 1:n-1  # one elimination step per column k
+        d = sqrt(c[k])  # k-th diagonal entry of L
+        for j in k:n
+            L[j, k] = l[j] = c[j]/d  # column k of L, also kept in l for the updates below
+        end
+        for j in k:n
+            v[j] = G[j, 1]*G[k, 2] - G[j, 2]*G[k, 1]  # skew-symmetric combination of the two generator columns
+        end
+        for j in k+1:n-1
+            ĉ[j] = (X[j-1, j]*c[j-1] + (X[j, j]-X[k, k])*c[j] + X[j+1, j]*c[j+1] + c[k]*row1[j] - row1[k]*c[j] - v[j])/X[k+1, k]
+        end
+        ĉ[n] = (X[n-1, n]*c[n-1] + (X[n, n]-X[k, k])*c[n] + c[k]*row1[n] - row1[k]*c[n] - v[n])/X[k+1, k]  # last row: same formula without the X[j+1, j] term
+        cst = X[k+1, k]/d
+        for j in k+1:n
+            row1[j] = -cst*l[j]  # refresh row1 for the next step
+        end
+        cst = c[k+1]/d  # `cst` is reused with a different meaning from here on
+        for j in k:n
+            c[j] = ĉ[j] - cst*l[j]  # working column for step k+1
+        end
+        gd1 = G[k, 1]/d
+        gd2 = G[k, 2]/d
+        for j in k:n
+            G[j, 1] -= l[j]*gd1  # update both generator columns in place
+            G[j, 2] -= l[j]*gd2
+        end
+    end
+    L[n, n] = sqrt(c[n])  # final diagonal entry; as the last expression, its value is also what the function returns
+end
+
+function cholesky(::Union{AbstractBandedLayout, SymmetricLayout{<: AbstractBandedLayout}}, W::GramMatrix{T}) where T  # banded layouts: the factor is stored as a BandedMatrix
+    n = size(W, 1)
+    G = compute_skew_generators(W)
+    L = BandedMatrix{T}(undef, (n, n), (bandwidth(W, 1), 0))  # same lower bandwidth as W, no superdiagonals
+    c = W[:, 1]  # first column of W seeds the column recurrence
+    ĉ = zeros(T, n)  # ĉ, l, v and row1 are workspaces that fastcholesky! overwrites
+    l = zeros(T, n)
+    v = zeros(T, n)
+    row1 = zeros(T, n)
+    fastcholesky!(L, W.X, G, c, ĉ, l, v, row1, n)
+    return Cholesky(L, 'L', 0)  # wrap L as a lower-triangular ('L') factor with info = 0
+end
+
+function fastcholesky!(L::BandedMatrix{T}, X, G, c, ĉ, l, v, row1, n) where T  # banded variant: loops are clipped to the lower bandwidth of L
+    b = bandwidth(L, 1)
+    @inbounds for k in 1:n-1  # no @simd: step k reads c, row1 and G as left by step k-1, so iterations must run in order
+        d = sqrt(c[k])  # c holds the current column; c[k] is its diagonal entry
+        for j in k:min(k+b, n)
+            L[j, k] = l[j] = c[j]/d
+        end
+        for j in max(k, n-b-1):n  # only the trailing b+2 rows are visited; other v[j] keep their initial value
+            v[j] = G[j, 1]*G[k, 2] - G[j, 2]*G[k, 1]  # equals (G*J*G')[j, k] with J = [0 1; -1 0]
+        end
+        for j in k+1:min(k+b+1, n-1)
+            ĉ[j] = (X[j-1, j]*c[j-1] + (X[j, j]-X[k, k])*c[j] + X[j+1, j]*c[j+1] + c[k]*row1[j] - row1[k]*c[j] - v[j])/X[k+1, k]
+        end
+        if k ≥ n-b-1  # row n is only within reach of column k near the end
+            ĉ[n] = (X[n-1, n]*c[n-1] + (X[n, n]-X[k, k])*c[n] + c[k]*row1[n] - row1[k]*c[n] - v[n])/X[k+1, k]
+        end
+        cst = X[k+1, k]/d
+        for j in k+1:min(k+b+1, n)
+            row1[j] = -cst*l[j]
+        end
+        cst = c[k+1]/d
+        for j in k:min(k+b+1, n)
+            c[j] = ĉ[j] - cst*l[j]  # c becomes the column used by step k+1
+        end
+        gd1 = G[k, 1]/d
+        gd2 = G[k, 2]/d
+        for j in max(k, n-b-1):n
+            G[j, 1] -= l[j]*gd1  # subtract l[j]/d times row k of G from row j
+            G[j, 2] -= l[j]*gd2
+        end
+    end
+    L[n, n] = sqrt(c[n])  # final pivot; there is no further column to update
+end
+
+struct ChebyshevGramMatrix{T, V <: AbstractVector{T}} <: AbstractGramMatrix{T}
+    μ::V  # moment vector that getindex reads the entries from
+    n::Int  # matrix dimension: size is (n, n)
+end
+
+"""
+    ChebyshevGramMatrix(μ::AbstractVector)
+
+Construct a Chebyshev--Gram matrix of size `(length(μ)+1)÷2` with entries:
+```math
+2 W[i, j] = µ[|i-j|+1] + µ[i+j-1].
+```
+Due to the linearization of a product of two first-kind Chebyshev polynomials,
+the Chebyshev--Gram matrix can be constructed from modified Chebyshev moments:
+```math
+µ[n] = ⟨ Tₙ₋₁, 1⟩.
+```
+Specialized construction and Cholesky factorization is given for this type.
+
+See also [`GramMatrix`](@ref) for the general case.
+"""
+function ChebyshevGramMatrix(μ::V) where {T, V <: AbstractVector{T}}  # μ is stored as given, not copied
+    n = (length(μ)+1)÷2  # entries of an n×n matrix read μ[1] through μ[2n-1]
+    ChebyshevGramMatrix{T, V}(μ, n)
+end
+
+@inline size(G::ChebyshevGramMatrix) = (G.n, G.n)  # always square
+@inline getindex(G::ChebyshevGramMatrix, i::Integer, j::Integer) = (G.μ[abs(i-j)+1] + G.μ[i+j-1])/2  # Toeplitz term plus Hankel term, halved
+@inline bandwidths(G::ChebyshevGramMatrix{T, <: PaddedVector{T}}) where T = (length(G.μ.args[2])-1, length(G.μ.args[2])-1)  # NOTE(review): presumably args[2] is the stored, non-padding part of μ — confirm
+@inline MemoryLayout(G::ChebyshevGramMatrix{T, <: PaddedVector{T}}) where T = BandedLayout()  # padded moments select the banded code paths
+
+#
+# 2X'W-W*2X = G*J*G'
+# This returns G, where J = [0 1; -1 0], respecting the skew-symmetry of the right-hand side.
+# We use twice the Chebyshev Jacobi matrix so that subsequent arithmetic is easier.
+#
+function compute_skew_generators(W::ChebyshevGramMatrix{T}) where T  # returns the n×2 generator G of the displacement equation above
+    μ = W.μ
+    n = size(W, 1)
+    G = zeros(T, n, 2)
+    G[n, 1] = one(T)  # first generator column is the n-th unit vector
+    @inbounds @simd for j in 1:n-1  # G[n, 2] is left at zero
+        G[j, 2] = -(μ[n+2-j] + μ[n+j])/2  # highest index read is μ[2n-1]; @inbounds skips the bounds check
+    end
+    G
+end
+
+function cholesky(W::ChebyshevGramMatrix{T}) where T  # generic moment vector: the factor is stored in a dense Matrix
+    n = size(W, 1)
+    G = compute_skew_generators(W)
+    L = zeros(T, n, n)
+    c = W[:, 1]  # first column of W seeds the column recurrence
+    ĉ = zeros(T, n)  # ĉ, l, v and row1 are workspaces that fastcholesky! overwrites
+    l = zeros(T, n)
+    v = zeros(T, n)
+    row1 = zeros(T, n)
+    fastcholesky!(L, G, c, ĉ, l, v, row1, n)
+    return Cholesky(L, 'L', 0)  # wrap L as a lower-triangular ('L') factor with info = 0
+end
+
+function fastcholesky!(L::Matrix{T}, G, c, ĉ, l, v, row1, n) where T  # Chebyshev variant (no X argument); G, c, ĉ, l, v, row1 are overwritten
+    @inbounds for k in 1:n-1  # no @simd: step k reads c, row1 and G as left by step k-1, so iterations must run in order
+        d = sqrt(c[k])  # c holds the current column; c[k] is its diagonal entry
+        for j in k:n
+            L[j, k] = l[j] = c[j]/d
+        end
+        for j in k:n
+            v[j] = G[j, 1]*G[k, 2] - G[j, 2]*G[k, 1]  # equals (G*J*G')[j, k] with J = [0 1; -1 0]
+        end
+        if k == 1  # first step divides by 2 and uses cst = 2/d; presumably the doubled Jacobi matrix has 2 in position (2, 1) — confirm
+            for j in 2:n-1
+                ĉ[j] = (c[j+1] + c[j-1] + c[1]*row1[j] - row1[1]*c[j] - v[j])/2
+            end
+            ĉ[n] = (c[n-1] + c[1]*row1[n] - row1[1]*c[n] - v[n])/2  # last row: no c[n+1] term
+            cst = 2/d
+        else
+            for j in k+1:n-1
+                ĉ[j] = c[j+1] + c[j-1] + c[k]*row1[j] - row1[k]*c[j] - v[j]
+            end
+            ĉ[n] = c[n-1] + c[k]*row1[n] - row1[k]*c[n] - v[n]  # last row: no c[n+1] term
+            cst = 1/d
+        end
+        for j in k+1:n
+            row1[j] = -cst*l[j]
+        end
+        cst = c[k+1]/d
+        for j in k:n
+            c[j] = ĉ[j] - cst*l[j]  # c becomes the column used by step k+1
+        end
+        gd1 = G[k, 1]/d
+        gd2 = G[k, 2]/d
+        for j in k:n
+            G[j, 1] -= l[j]*gd1  # subtract l[j]/d times row k of G from row j
+            G[j, 2] -= l[j]*gd2
+        end
+    end
+    L[n, n] = sqrt(c[n])  # final pivot; there is no further column to update
+end
+
+function cholesky(W::ChebyshevGramMatrix{T, <: PaddedVector{T}}) where T  # padded moments: the factor is stored as a BandedMatrix
+    n = size(W, 1)
+    G = compute_skew_generators(W)
+    L = BandedMatrix{T}(undef, (n, n), (bandwidth(W, 1), 0))  # same lower bandwidth as W, no superdiagonals
+    c = W[:, 1]  # first column of W seeds the column recurrence
+    ĉ = zeros(T, n)  # ĉ, l, v and row1 are workspaces that fastcholesky! overwrites
+    l = zeros(T, n)
+    v = zeros(T, n)
+    row1 = zeros(T, n)
+    fastcholesky!(L, G, c, ĉ, l, v, row1, n)
+    return Cholesky(L, 'L', 0)  # wrap L as a lower-triangular ('L') factor with info = 0
+end
+
+function fastcholesky!(L::BandedMatrix{T}, G, c, ĉ, l, v, row1, n) where T  # banded Chebyshev variant: loops are clipped to the lower bandwidth of L
+    b = bandwidth(L, 1)
+    @inbounds for k in 1:n-1  # no @simd: step k reads c, row1 and G as left by step k-1, so iterations must run in order
+        d = sqrt(c[k])  # c holds the current column; c[k] is its diagonal entry
+        for j in k:min(k+b, n)
+            L[j, k] = l[j] = c[j]/d
+        end
+        for j in max(k, n-b-1):n  # only the trailing b+2 rows are visited; other v[j] keep their initial value
+            v[j] = G[j, 1]*G[k, 2] - G[j, 2]*G[k, 1]  # equals (G*J*G')[j, k] with J = [0 1; -1 0]
+        end
+        if k == 1  # first step divides by 2 and uses cst = 2/d; presumably the doubled Jacobi matrix has 2 in position (2, 1) — confirm
+            for j in 2:min(b+2, n-1)
+                ĉ[j] = (c[j+1] + c[j-1] + c[1]*row1[j] - row1[1]*c[j] - v[j])/2
+            end
+            if 1 ≥ n-b-1  # row n is only within reach of column 1 when the band is wide enough
+                ĉ[n] = (c[n-1] + c[1]*row1[n] - row1[1]*c[n] - v[n])/2
+            end
+            cst = 2/d
+        else
+            for j in k+1:min(k+b+1, n-1)
+                ĉ[j] = c[j+1] + c[j-1] + c[k]*row1[j] - row1[k]*c[j] - v[j]
+            end
+            if k ≥ n-b-1  # row n is only within reach of column k near the end
+                ĉ[n] = c[n-1] + c[k]*row1[n] - row1[k]*c[n] - v[n]
+            end
+            cst = 1/d
+        end
+        for j in k+1:min(k+b+1, n)
+            row1[j] = -cst*l[j]
+        end
+        cst = c[k+1]/d
+        for j in k:min(k+b+1, n)
+            c[j] = ĉ[j] - cst*l[j]  # c becomes the column used by step k+1
+        end
+        gd1 = G[k, 1]/d
+        gd2 = G[k, 2]/d
+        for j in max(k, n-b-1):n
+            G[j, 1] -= l[j]*gd1  # subtract l[j]/d times row k of G from row j
+            G[j, 2] -= l[j]*gd2
+        end
+    end
+    L[n, n] = sqrt(c[n])  # final pivot; there is no further column to update
+end
diff --git a/src/PaduaTransform.jl b/src/PaduaTransform.jl
index 9cba3251..c7670924 100644
--- a/src/PaduaTransform.jl
+++ b/src/PaduaTransform.jl
@@ -1,5 +1,10 @@
 
 # lex indicates if its lexigraphical (i.e., x, y) or reverse (y, x)
+# If in lexigraphical order the coefficient vector's entries
+# correspond to the following basis polynomials:
+# [T0(x) * T0(y), T1(x) * T0(y), T0(x) * T1(y), T2(x) * T0(y), T1(x) * T1(y), T0(x) * T2(y), ...]
+# else, if not in lexigraphical order:
+# [T0(x) * T0(y), T0(x) * T1(y), T1(x) * T0(y), T0(x) * T2(y), T1(x) * T1(y), T2(x) * T0(y), ...]
 """
 Pre-plan an Inverse Padua Transform.
 """
@@ -118,7 +123,7 @@ Pre-plan a Padua Transform.
 """
 function plan_paduatransform!(::Type{T},N::Integer,lex) where T
     n=Int(cld(-3+sqrt(1+8N),2))
-    if N ≠ ((n+1)*(n+2))÷2
+    if N ≠ ((n+1)*(n+2))÷2
         error("Padua transforms can only be applied to vectors of length (n+1)*(n+2)/2.")
     end
     PaduaTransformPlan(Array{T}(undef,n+2,n+1),FFTW.plan_r2r!(Array{T}(undef,n+2,n+1),FFTW.REDFT00),lex)
@@ -204,21 +209,46 @@ function paduapoints(::Type{T}, n::Integer) where T
     MM=Matrix{T}(undef,N,2)
     m=0
     delta=0
-    NN=fld(n+2,2)
-    @inbounds for k=n:-1:0
-        if isodd(n)>0
-            delta=mod(k,2)
+    NN=div(n,2)+1
+    # x coordinates
+    for k=n:-1:0
+        if isodd(n)
+            delta = Int(isodd(k))
         end
+        x = -cospi(T(k)/n)
         @inbounds for j=NN+delta:-1:1
             m+=1
-            MM[m,1]=sinpi(T(k)/n-T(0.5))
-            if isodd(n-k)>0
-                MM[m,2]=sinpi((2j-one(T))/(n+1)-T(0.5))
+            MM[m,1]=x
+        end
+    end
+    # y coordinates
+    # populate the first two sets, and copy the rest
+    m=0
+    for k=n:-1:n-1
+        if isodd(n)
+            delta = Int(isodd(k))
+        end
+        for j=NN+delta:-1:1
+            m+=1
+            @inbounds if isodd(n-k)
+                MM[m,2]=-cospi((2j-one(T))/(n+1))
             else
-                MM[m,2]=sinpi(T(2j-2)/(n+1)-T(0.5))
+                MM[m,2]=-cospi(T(2j-2)/(n+1))
             end
         end
     end
+    m += 1
+    # number of y coordinates between k=n and k=n-2
+    Ny_shift = 2NN+isodd(n)
+    for k in n-2:-1:0
+        if isodd(n)
+            delta = Int(isodd(k))
+        end
+        for j in range(m, length=NN+delta)
+            @inbounds MM[j,2] = MM[j-Ny_shift,2]
+        end
+        m += NN+delta
+    end
     return MM
 end
 
diff --git a/src/ToeplitzPlusHankel.jl b/src/ToeplitzPlusHankel.jl
new file mode 100644
index 00000000..535184b7
--- /dev/null
+++ b/src/ToeplitzPlusHankel.jl
@@ -0,0 +1,315 @@
+struct ToeplitzPlusHankel{T, S, P1 <: Plan{S}, P2 <: Plan{S}} <: AbstractMatrix{T}
+    tc::Vector{T}  # Toeplitz first column: entry (i, j) uses tc[i-j+1] when i ≥ j
+    tr::Vector{T}  # Toeplitz first row: entry (i, j) uses tr[j-i+1] when i < j
+    h::Vector{T}  # Hankel symbol: entry (i, j) uses h[i+j-1]
+    th_dft::Matrix{S}  # transformed symbols that mul! multiplies by when applying A
+    tht_dft::Matrix{S}  # transformed symbols that mul! multiplies by when applying A'
+    temp::Matrix{S}  # scratch buffer overwritten by every matrix-vector mul!
+    plan::P1  # in-place FFT plan along dimension 1
+    iplan::P2  # inverse of plan
+    size::NTuple{2, Int}  # (m, n) = (length(tc), length(tr))
+end
+
+# Builds the operator and precomputes both symbol DFTs; enforces tr[1] == tc[1] by overwriting tr[1] in the caller's vector.
+function ToeplitzPlusHankel(tc::Vector{T}, tr::Vector{T}, h::Vector{T}) where T
+    m = length(tc)
+    n = length(tr)
+    @assert length(h) == m+n-1
+    tr[1] = tc[1]  # mutates the argument; tc, tr and h are stored without copying
+    mn = m+n
+    S = promote_type(float(T), Complex{Float32})  # complex element type of the FFT buffers
+    th_dft = Matrix{S}(undef, mn, 2)
+    copyto!(th_dft, 1, tc, 1, m)  # column 1: tc, one zero, then tr[n], …, tr[2]
+    th_dft[m+1, 1] = zero(T)
+    copyto!(th_dft, m+2, Iterators.reverse(tr), 1, n-1)
+    copyto!(th_dft, mn+1, h, n, m)  # column 2 (linear index mn+1 onward): h[n:n+m-1], one zero, then h[1:n-1]
+    th_dft[m+1, 2] = zero(T)
+    copyto!(th_dft, mn+m+2, h, 1, n-1)
+    tht_dft = Matrix{S}(undef, mn, 2)
+    copyto!(tht_dft, 1, tr, 1, n)  # same layout with the roles of tc/tr and m/n exchanged
+    tht_dft[n+1, 1] = zero(T)
+    copyto!(tht_dft, n+2, Iterators.reverse(tc), 1, m-1)
+    copyto!(tht_dft, mn+1, h, m, n)
+    tht_dft[n+1, 2] = zero(T)
+    copyto!(tht_dft, mn+n+2, h, 1, m-1)
+
+    plan = plan_fft!(th_dft, 1)
+    plan*th_dft  # in-place: th_dft now holds the transform of each column
+    plan*tht_dft  # the same plan is reused; both buffers are mn × 2
+    temp = zeros(S, mn, 2)
+    iplan = inv(plan)
+
+    ToeplitzPlusHankel{T, S, typeof(plan), typeof(iplan)}(tc, tr, h, th_dft, tht_dft, temp, plan, iplan, (m, n))
+end
+
+# A ChebyshevGramMatrix isa (symmetric positive-definite) ToeplitzPlusHankel matrix.
+function ToeplitzPlusHankel(G::ChebyshevGramMatrix)  # tc = tr = μ[1:n]/2 and h = μ[1:2n-1]/2
+    n = size(G, 1)
+    ToeplitzPlusHankel(G.μ[1:n]/2, G.μ[1:n]/2, G.μ[1:2n-1]/2)  # h must have exactly 2n-1 entries; an even-length μ carries one unused moment
+end
+
+size(A::ToeplitzPlusHankel) = A.size  # (m, n) as recorded at construction
+getindex(A::ToeplitzPlusHankel, i::Integer, j::Integer) = (i ≥ j ? A.tc[i-j+1] : A.tr[j-i+1]) + A.h[i+j-1]  # Toeplitz entry (column or row symbol) plus Hankel entry
+
+# A contiguous sub-block of a T+H matrix is again T+H; this builds a new ToeplitzPlusHankel from copied symbol slices (not a view).
+function getindex(A::ToeplitzPlusHankel, ir::UnitRange{Int}, jr::UnitRange{Int})
+    fir, lir = first(ir), last(ir)
+    fjr, ljr = first(jr), last(jr)
+    if fir ≥ fjr  # top-left corner of the block lies on or below the main diagonal of A
+        tc = A.tc[fir-fjr+1:lir-fjr+1]
+        tr = [A.tc[fir-fjr+1:-1:max(1, fir-ljr+1)]; A.tr[2:ljr-fir+1]]  # walk back along tc to the diagonal, then continue along tr
+    else  # top-left corner lies strictly above the main diagonal of A
+        tc = [A.tr[fjr-fir+1:-1:max(1, fjr-lir+1)]; A.tc[2:lir-fjr+1]]  # walk back along tr to the diagonal, then continue along tc
+        tr = A.tr[fjr-fir+1:ljr-fir+1]
+    end
+    ToeplitzPlusHankel(tc, tr, A.h[fir+fjr-1:lir+ljr-1])  # Hankel symbol covers i+j-1 over the selected rows and columns
+end
+
+
+# y ← A x α + y β, computed with FFTs of length m+n; overwrites and returns y
+function mul!(y::StridedVector{T}, A::ToeplitzPlusHankel{T}, x::StridedVector{T}, α::S, β::S) where {T <: Real, S <: Real}
+    m, n = size(A)
+    @assert m == length(y)
+    @assert n == length(x)
+    mn = m+n
+    th_dft = A.th_dft
+    temp = A.temp  # scratch buffer owned by A: its contents are overwritten on every call
+    plan = A.plan
+    iplan = A.iplan
+
+    copyto!(temp, 1, x, 1, n)  # column 1 ← x
+    copyto!(temp, mn+1, Iterators.reverse(x), 1, n)  # column 2 ← x reversed
+    @inbounds for j in n+1:mn  # zero-pad both columns below row n
+        temp[j, 1] = zero(T)
+        temp[j, 2] = zero(T)
+    end
+    plan*temp
+    temp .*= th_dft  # pointwise product with the precomputed symbol transforms
+    iplan*temp
+
+    if iszero(β)  # β == 0: the previous contents of y are never read
+        @inbounds @simd for i in 1:m
+            y[i] = α * (real(temp[i, 1])+real(temp[i, 2]))  # sum of the two column results, first m rows only
+        end
+    else
+        @inbounds @simd for i in 1:m
+            y[i] = α * (real(temp[i, 1])+real(temp[i, 2])) + β*y[i]
+        end
+    end
+    return y
+end
+
+# y ← A' x α + y β, computed with FFTs of length m+n; overwrites and returns y
+function mul!(y::StridedVector{T}, A::Adjoint{T, <:ToeplitzPlusHankel{T}}, x::StridedVector{T}, α::S, β::S) where {T <: Real, S <: Real}
+    m, n = size(A)  # size of the adjoint wrapper, not of the parent
+    @assert m == length(y)
+    @assert n == length(x)
+    mn = m+n
+    AP = A.parent
+    tht_dft = AP.tht_dft  # symbol transforms prepared for the transposed product
+    temp = AP.temp  # scratch buffer owned by the parent: overwritten on every call
+    plan = AP.plan
+    iplan = AP.iplan
+
+    copyto!(temp, 1, x, 1, n)  # column 1 ← x
+    copyto!(temp, mn+1, Iterators.reverse(x), 1, n)  # column 2 ← x reversed
+    @inbounds for j in n+1:mn  # zero-pad both columns below row n
+        temp[j, 1] = zero(T)
+        temp[j, 2] = zero(T)
+    end
+    plan*temp
+    temp .*= tht_dft  # pointwise product with the precomputed symbol transforms
+    iplan*temp
+
+    if iszero(β)  # β == 0: the previous contents of y are never read
+        @inbounds @simd for i in 1:m
+            y[i] = α * (real(temp[i, 1])+real(temp[i, 2]))  # sum of the two column results, first m rows only
+        end
+    else
+        @inbounds @simd for i in 1:m
+            y[i] = α * (real(temp[i, 1])+real(temp[i, 2])) + β*y[i]
+        end
+    end
+    return y
+end
+
+
+# C ← A B α + C β, one pair of FFT columns per column of B; overwrites and returns C
+function mul!(C::StridedMatrix{T}, A::ToeplitzPlusHankel{T}, B::StridedMatrix{T}, α::S, β::S) where {T <: Real, S <: Real}
+    m, n = size(A)
+    @assert m == size(C, 1)
+    @assert n == size(B, 1)
+    p = size(B, 2)
+    if size(C, 2) != p
+        throw(DimensionMismatch("input and output matrices must have same number of columns"))
+    end
+
+    th_dft = A.th_dft
+    TC = promote_type(float(T), Complex{Float32})
+    temp = zeros(TC, m+n, 2p)  # fresh zero-filled workspace per call (A.temp is not used here)
+    plan = plan_fft!(temp, 1)  # a new plan is created on every call
+
+    for k in 1:p
+        copyto!(view(temp, :, 2k-1), 1, view(B, :, k), 1, n)  # odd column ← k-th column of B
+        copyto!(view(temp, :, 2k), 1, Iterators.reverse(view(B, :, k)), 1, n)  # even column ← the same column reversed
+    end
+    plan*temp
+    for k in 1:p
+        vt = view(temp, :, 2k-1:2k)
+        vt .*= th_dft  # pointwise product with the precomputed symbol transforms
+    end
+    plan\temp
+
+    if iszero(β)  # β == 0: the previous contents of C are never read
+        @inbounds for k in 1:p
+            for i in 1:m
+                C[i, k] = α * (real(temp[i, 2k-1])+real(temp[i, 2k]))
+            end
+        end
+    else
+        @inbounds for k in 1:p
+            for i in 1:m
+                C[i, k] = α * (real(temp[i, 2k-1])+real(temp[i, 2k])) + β*C[i, k]
+            end
+        end
+    end
+    return C
+end
+
+# C ← A B α + C β with the T+H matrix on the right; morally equivalent to mul!(C', B', A', α, β)' with StridedMatrix replaced by AbstractMatrix below
+function mul!(C::StridedMatrix{T}, A::StridedMatrix{T}, B::ToeplitzPlusHankel{T}, α::S, β::S) where {T <: Real, S <: Real}
+    n, m = size(B)  # note the order: n rows and m columns of B
+    @assert m == size(C, 2)
+    @assert n == size(A, 2)
+    p = size(A, 1)
+    if size(C, 1) != p
+        throw(DimensionMismatch("input and output matrices must have same number of rows"))
+    end
+
+    tht_dft = B.tht_dft  # symbol transforms prepared for the transposed product
+    TC = promote_type(float(T), Complex{Float32})
+    temp = zeros(TC, m+n, 2p)  # fresh zero-filled workspace per call (B.temp is not used here)
+    plan = plan_fft!(temp, 1)  # a new plan is created on every call
+
+    for k in 1:p
+        copyto!(view(temp, :, 2k-1), 1, view(A, k, :), 1, n)  # odd column ← k-th row of A
+        copyto!(view(temp, :, 2k), 1, Iterators.reverse(view(A, k, :)), 1, n)  # even column ← the same row reversed
+    end
+    plan*temp
+    for k in 1:p
+        vt = view(temp, :, 2k-1:2k)
+        vt .*= tht_dft  # pointwise product with the precomputed symbol transforms
+    end
+    plan\temp
+
+    if iszero(β)  # β == 0: the previous contents of C are never read
+        @inbounds for k in 1:p
+            for i in 1:m
+                C[k, i] = α * (real(temp[i, 2k-1])+real(temp[i, 2k]))  # results are written back as rows of C
+            end
+        end
+    else
+        @inbounds for k in 1:p
+            for i in 1:m
+                C[k, i] = α * (real(temp[i, 2k-1])+real(temp[i, 2k])) + β*C[k, i]
+            end
+        end
+    end
+    return C
+end
+
+# C ← A' B α + C β, one pair of FFT columns per column of B; overwrites and returns C
+function mul!(C::StridedMatrix{T}, A::Adjoint{T, <:ToeplitzPlusHankel{T}}, B::StridedMatrix{T}, α::S, β::S) where {T <: Real, S <: Real}
+    m, n = size(A)  # size of the adjoint wrapper, not of the parent
+    @assert m == size(C, 1)
+    @assert n == size(B, 1)
+    p = size(B, 2)
+    if size(C, 2) != p
+        throw(DimensionMismatch("input and output matrices must have same number of columns"))
+    end
+
+    tht_dft = A.parent.tht_dft  # symbol transforms prepared for the transposed product
+    TC = promote_type(float(T), Complex{Float32})
+    temp = zeros(TC, m+n, 2p)  # fresh zero-filled workspace per call (the parent's temp is not used here)
+    plan = plan_fft!(temp, 1)  # a new plan is created on every call
+
+    for k in 1:p
+        copyto!(view(temp, :, 2k-1), 1, view(B, :, k), 1, n)  # odd column ← k-th column of B
+        copyto!(view(temp, :, 2k), 1, Iterators.reverse(view(B, :, k)), 1, n)  # even column ← the same column reversed
+    end
+    plan*temp
+    for k in 1:p
+        vt = view(temp, :, 2k-1:2k)
+        vt .*= tht_dft  # pointwise product with the precomputed symbol transforms
+    end
+    plan\temp
+
+    if iszero(β)  # β == 0: the previous contents of C are never read
+        @inbounds for k in 1:p
+            for i in 1:m
+                C[i, k] = α * (real(temp[i, 2k-1])+real(temp[i, 2k]))
+            end
+        end
+    else
+        @inbounds for k in 1:p
+            for i in 1:m
+                C[i, k] = α * (real(temp[i, 2k-1])+real(temp[i, 2k])) + β*C[i, k]
+            end
+        end
+    end
+    return C
+end
+
+# Estimate the Frobenius norm of the Toeplitz-plus-Hankel matrix by working with the symbols: returns ‖T‖_F + ‖H‖_F without forming any entries.
+function normest(A::ToeplitzPlusHankel{T}) where T
+    m, n = size(A)
+    tc = A.tc
+    tr = A.tr
+    h = A.h
+    ret1 = zero(T)  # accumulates the squared Frobenius norm of the Toeplitz part
+    ret2 = zero(T)  # accumulates the squared Frobenius norm of the Hankel part
+    if m == min(m, n)  # m ≤ n: at least as many columns as rows
+        for i = 1:m
+            ret1 += (m+1-i)*abs2(tc[i])  # weight = number of entries on that diagonal
+        end
+        for i = 2:n-m
+            ret1 += m*abs2(tr[i])  # superdiagonals that span all m rows
+        end
+        for i = max(n-m+1, 2):n  # starts at 2 because tr[1] duplicates tc[1]
+            ret1 += (n+1-i)*abs2(tr[i])
+        end
+        for i = 1:m
+            ret2 += i*abs2(h[i])  # weight = number of entries on that anti-diagonal
+        end
+        for i = m+1:n
+            ret2 += m*abs2(h[i])
+        end
+        for i = n+1:m+n-1
+            ret2 += (m+n-i)*abs2(h[i])
+        end
+    else  # m > n: same counts with the roles of rows and columns exchanged
+        for i = 1:n
+            ret1 += (n+1-i)*abs2(tr[i])
+        end
+        for i = 2:m-n
+            ret1 += n*abs2(tc[i])
+        end
+        for i = max(m-n+1, 2):m  # starts at 2 because tc[1] duplicates tr[1]
+            ret1 += (m+1-i)*abs2(tc[i])
+        end
+        for i = 1:n
+            ret2 += i*abs2(h[i])
+        end
+        for i = n+1:m
+            ret2 += n*abs2(h[i])
+        end
+        for i = m+1:m+n-1
+            ret2 += (m+n-i)*abs2(h[i])
+        end
+    end
+    sqrt(ret1) + sqrt(ret2)  # sum of the two norms, not the norm of the sum
+end
+
+normest(A::Symmetric{T, <: ToeplitzPlusHankel{T}}) where T = normest(parent(A))  # wrapper is ignored: the estimate uses the full parent
+normest(A::Hermitian{T, <: ToeplitzPlusHankel{T}}) where T = normest(parent(A))  # wrapper is ignored: the estimate uses the full parent
+normest(A::ChebyshevGramMatrix{T}) where T = normest(ToeplitzPlusHankel(A))  # converts first, which also builds the FFT plans and buffers
diff --git a/src/arrays.jl b/src/arrays.jl
new file mode 100644
index 00000000..5472e736
--- /dev/null
+++ b/src/arrays.jl
@@ -0,0 +1,86 @@
+struct ArrayPlan{T, FF<:FTPlan{<:T}, Szs<:Tuple, Dims<:Tuple{<:Int}} <: Plan{T}
+    F::FF  # underlying plan, applied to a matrix with the chosen dimension leading
+    szs::Szs  # size of the arrays this plan accepts
+    dims::Dims  # one-element tuple: the dimension to transform along
+end
+size(P::ArrayPlan) = P.szs  # the array size the plan was built for
+
+function ArrayPlan(F::FTPlan{<:T}, c::AbstractArray{T}, dims::Tuple{<:Int}=(1,)) where T  # c only supplies the size; its values are not read
+    szs = size(c)
+    @assert F.n == szs[dims[1]]  # the plan length must match the transformed dimension
+    ArrayPlan(F, size(c), dims)
+end
+
+function *(P::ArrayPlan, f::AbstractArray)  # applies F along dimension dims[1]; returns a new array, f is not modified
+    F, dims, szs = P.F, P.dims, P.szs
+    @assert length(dims) == 1
+    @assert szs == size(f)
+    d = first(dims)
+
+    perm = (d, ntuple(i-> i + (i >= d), ndims(f) -1)...)  # d first, remaining dimensions in their original order
+    fp = permutedims(f, perm)
+
+    fr = reshape(fp, size(fp,1), :)  # flatten the trailing dimensions so F sees a matrix
+
+    permutedims(reshape(F*fr, size(fp)...), invperm(perm))  # restore the shape, then the original dimension order
+end
+
+function \(P::ArrayPlan, f::AbstractArray)  # applies F\ along dimension dims[1]; returns a new array, f is not modified
+    F, dims, szs = P.F, P.dims, P.szs
+    @assert length(dims) == 1
+    @assert szs == size(f)
+    d = first(dims)
+
+    perm = (d, ntuple(i-> i + (i >= d), ndims(f) -1)...)  # d first, remaining dimensions in their original order
+    fp = permutedims(f, perm)
+
+    fr = reshape(fp, size(fp,1), :)  # flatten the trailing dimensions so F sees a matrix
+
+    permutedims(reshape(F\fr, size(fp)...), invperm(perm))  # restore the shape, then the original dimension order
+end
+
+struct NDimsPlan{T, FF<:ArrayPlan{<:T}, Szs<:Tuple, Dims<:Tuple} <: Plan{T}
+    F::FF  # single-dimension plan reused for every entry of dims
+    szs::Szs  # size of the arrays this plan accepts
+    dims::Dims  # dimensions to transform, applied one after another
+    function NDimsPlan(F, szs, dims)  # inner constructor: validates, then fills in the type parameters
+        if length(Set(szs[[dims...]])) > 1  # all transformed dimensions must share one length
+            error("Different size in dims axes not yet implemented in N-dimensional transform.")
+        end
+        new{eltype(F), typeof(F), typeof(szs), typeof(dims)}(F, szs, dims)
+    end
+end
+
+size(P::NDimsPlan) = P.szs  # the array size the plan was built for
+
+function NDimsPlan(F::FTPlan, szs::Tuple, dims::Tuple)  # wraps F in an ArrayPlan along the first listed dimension
+    NDimsPlan(ArrayPlan(F, szs, (first(dims),)), szs, dims)
+end
+
+function *(P::NDimsPlan, f::AbstractArray)  # applies F along each dimension in dims in turn; f is not modified
+    F, dims = P.F, P.dims
+    @assert size(P) == size(f)
+    g = copy(f)
+    t = 1:ndims(g)
+    d1 = dims[1]  # the dimension F itself transforms along
+    for d in dims
+        perm = ntuple(k -> k == d1 ? t[d] : k == d ? t[d1] : t[k], ndims(g))  # swap dimensions d1 and d (identity when d == d1)
+        gp = permutedims(g, perm)
+        g = permutedims(F*gp, invperm(perm))  # transform, then swap back
+    end
+    return g
+end
+
+function \(P::NDimsPlan, f::AbstractArray)  # applies F\ along each dimension in dims in turn; f is not modified
+    F, dims = P.F, P.dims
+    @assert size(P) == size(f)
+    g = copy(f)
+    t = 1:ndims(g)
+    d1 = dims[1]  # the dimension F itself transforms along
+    for d in dims
+        perm = ntuple(k -> k == d1 ? t[d] : k == d ? t[d1] : t[k], ndims(g))  # swap dimensions d1 and d (identity when d == d1)
+        gp = permutedims(g, perm)
+        g = permutedims(F\gp, invperm(perm))  # inverse-transform, then swap back
+    end
+    return g
+end
\ No newline at end of file
diff --git a/src/chebyshevtransform.jl b/src/chebyshevtransform.jl
index a705ca49..2d3f17d9 100644
--- a/src/chebyshevtransform.jl
+++ b/src/chebyshevtransform.jl
@@ -1,301 +1,619 @@
 ## Transforms take values at Chebyshev points of the first and second kinds and produce Chebyshev coefficients
 
+abstract type ChebyshevPlan{T} <: Plan{T} end
 
-struct ChebyshevTransformPlan{T,kind,inplace,P} <: Plan{T}
-    plan::P
+*(P::ChebyshevPlan{T}, x::AbstractArray{T}) where T = error("Plan applied to wrong size array")
+
+size(P::ChebyshevPlan) = isdefined(P, :plan) ? size(P.plan) : (0,)
+length(P::ChebyshevPlan) = isdefined(P, :plan) ? length(P.plan) : 0
+
+
+const FIRSTKIND = FFTW.REDFT10
+const SECONDKIND = FFTW.REDFT00
+
+struct ChebyshevTransformPlan{T,kind,K,inplace,N,R} <: ChebyshevPlan{T}
+    plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}
+    ChebyshevTransformPlan{T,kind,K,inplace,N,R}(plan) where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}(plan)
+    ChebyshevTransformPlan{T,kind,K,inplace,N,R}() where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}()
 end
 
-ChebyshevTransformPlan{k,inp}(plan) where {k,inp} =
-    ChebyshevTransformPlan{eltype(plan),k,inp,typeof(plan)}(plan)
+ChebyshevTransformPlan{T,kind}(plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}) where {T,kind,K,inplace,N,R} =
+    ChebyshevTransformPlan{T,kind,K,inplace,N,R}(plan)
+
+# jump through some hoops to make inferrable
+
+function plan_chebyshevtransform!(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(x)
+        ChebyshevTransformPlan{T,1,Vector{Int32},true,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
+    else
+        ChebyshevTransformPlan{T,1}(FFTW.plan_r2r!(x, FIRSTKIND, dims...; kws...))
+    end
+end
+function plan_chebyshevtransform!(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
+    any(≤(1),size(x)) && throw(ArgumentError("Array must contain at least 2 entries"))
+    ChebyshevTransformPlan{T,2}(FFTW.plan_r2r!(x, SECONDKIND, dims...; kws...))
+end
+
+
+function plan_chebyshevtransform(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(x)
+        ChebyshevTransformPlan{T,1,Vector{Int32},false,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
+    else
+        ChebyshevTransformPlan{T,1}(FFTW.plan_r2r(x, FIRSTKIND, dims...; kws...))
+    end
+end
+function plan_chebyshevtransform(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
+    any(≤(1),size(x)) && throw(ArgumentError("Array must contain at least 2 entries"))
+    ChebyshevTransformPlan{T,2}(FFTW.plan_r2r(x, SECONDKIND, dims...; kws...))
+end
 
 
+# convert x if necessary
+_maybemutablecopy(x::StridedArray{T}, ::Type{T}) where {T} = x
+_maybemutablecopy(x, T) = Array{T}(x)
+@inline _plan_mul!(y::AbstractArray{T}, P::Plan{T}, x::AbstractArray) where T = mul!(y, P, _maybemutablecopy(x, T))
 
-function plan_chebyshevtransform!(x::AbstractVector{T}; kind::Integer=1) where T<:fftwNumber
-    if kind == 1
-        plan = isempty(x) ? fill(one(T),1,length(x)) : FFTW.plan_r2r!(x, FFTW.REDFT10)
-        ChebyshevTransformPlan{1,true}(plan)
-    elseif kind == 2
-        plan = length(x) ≤ 1 ? fill(one(T),1,length(x)) : FFTW.plan_r2r!(x, FFTW.REDFT00)
-        ChebyshevTransformPlan{2,true}(plan)
+function applydim!(op!, X::AbstractArray, Rpre, Rpost, ind)  # calls op! on every view X[Ipre, ind, Ipost]; returns X
+    for Ipost in Rpost, Ipre in Rpre
+        v = view(X, Ipre, ind, Ipost)  # a view, so op! mutates X itself
+        op!(v)
     end
+    X
+end
+function applydim!(op!, X::AbstractArray, d::Integer, ind)  # applies op! to the slice(s) of X at index ind along dimension d
+    Rpre = CartesianIndices(axes(X)[1:d-1])  # all index combinations before dimension d
+    Rpost = CartesianIndices(axes(X)[d+1:end])  # all index combinations after dimension d
+    applydim!(op!, X, Rpre, Rpost, ind)
 end
 
-function plan_chebyshevtransform(x::AbstractVector{T};kind::Integer=1) where T<:fftwNumber
-    plan = plan_chebyshevtransform!(x;kind=kind)
-    ChebyshevTransformPlan{kind,false}(plan)
+for op in (:ldiv, :lmul)  # generates ldiv_dim_begin!, ldiv_dim_end!, lmul_dim_begin! and lmul_dim_end!
+    op_dim_begin! = Symbol(op, :_dim_begin!)
+    op_dim_end! = Symbol(op, :_dim_end!)
+    op! = Symbol(op, :!)
+    @eval begin
+        function $op_dim_begin!(α, d::Number, y::AbstractArray)
+            # apply op!(α, ⋅) in place to the slice of y at index 1 along dimension d; no data is permuted
+            d ∈ 1:ndims(y) || throw(ArgumentError("dimension $d must lie between 1 and $(ndims(y))"))
+            applydim!(v -> $op!(α, v), y, d, 1)
+        end
+
+        function $op_dim_end!(α, d::Number, y::AbstractArray)
+            # apply op!(α, ⋅) in place to the slice of y at the last index along dimension d; no data is permuted
+            d ∈ 1:ndims(y) || throw(ArgumentError("dimension $d must lie between 1 and $(ndims(y))"))
+            applydim!(v -> $op!(α, v), y, d, size(y, d))
+        end
+    end
 end
 
-function *(P::ChebyshevTransformPlan{T,1,true},x::AbstractVector{T}) where T
-    n = length(x)
-    n ≤ 1 && return x
 
-    x = P.plan*x
-    x[1] /= 2
-    lmul!(inv(convert(T,n)), x)
+@inline function _cheb1_rescale!(d::Number, y::AbstractArray)  # single-dimension case: scales y in place
+    ldiv_dim_begin!(2, d, y)  # halve the first slice along dimension d
+    ldiv!(size(y,d), y)  # then divide everything by the length of that dimension
+end
+
+function _prod_size(sz, d)  # product of sz[k] over the indices k in d
+    ret = 1
+    for k in d
+        ret *= sz[k]
+    end
+    ret
 end
 
-function *(P::ChebyshevTransformPlan{T,2,true}, x::AbstractVector{T}) where T
+
+@inline function _cheb1_rescale!(d, y::AbstractArray)  # d is a collection of dimensions: scales y in place
+    for k in d
+        ldiv_dim_begin!(2, k, y)  # halve the first slice along each transformed dimension
+    end
+    ldiv!(_prod_size(size(y), d), y)  # divide everything by the product of the transformed lengths
+end
+
+
+
+function *(P::ChebyshevTransformPlan{T,1,K,true,N}, x::AbstractArray{T,N}) where {T,K,N}  # in-place first-kind transform
+    isempty(x) && return x  # nothing to do for an empty array
+
+    y = P.plan*x # will be  === x if in-place
+    _cheb1_rescale!(P.plan.region, y)  # rescale along the dimensions the plan acted on
+end
+
+function mul!(y::AbstractArray{T,N}, P::ChebyshevTransformPlan{T,1,K,false,N}, x::AbstractArray{<:Any,N}) where {T,K,N}  # out-of-place first-kind transform into y
+    size(y) == size(x) || throw(DimensionMismatch("output must match dimension"))
+    isempty(x) && return y  # nothing to do for an empty array
+    _plan_mul!(y, P.plan, x)  # x is converted or copied first when it is not a strided array of T
+    _cheb1_rescale!(P.plan.region, y)  # rescale along the dimensions the plan acted on
+end
+
+
+
+function _cheb2_rescale!(d::Number, y::AbstractArray)  # single-dimension case: scales y in place
+    ldiv_dim_begin!(2, d, y)  # halve the first slice along dimension d
+    ldiv_dim_end!(2, d, y)  # halve the last slice along dimension d
+    ldiv!(size(y,d)-1, y)  # then divide everything by (length of that dimension - 1)
+end
+
+# TODO: higher dimensional arrays
+function _cheb2_rescale!(d, y::AbstractArray)  # d is a collection of dimensions: scales y in place
+    for k in d
+        ldiv_dim_begin!(2, k, y)  # halve the first slice along each transformed dimension
+        ldiv_dim_end!(2, k, y)  # and the last slice
+    end
+
+    ldiv!(_prod_size(size(y) .- 1, d), y)  # divide everything by the product of (length - 1) over the transformed dimensions
+end
+
+function *(P::ChebyshevTransformPlan{T,2,K,true,N}, x::AbstractArray{T,N}) where {T,K,N}
     n = length(x)
-    n ≤ 1 && return x
+    y = P.plan*x # will be  === x if in-place
+    _cheb2_rescale!(P.plan.region, y)
+end
 
-    x = P.plan*x
-    x[1] /= 2; x[end] /= 2
-    lmul!(inv(convert(T,n-1)),x)
+function mul!(y::AbstractArray{T,N}, P::ChebyshevTransformPlan{T,2,K,false,N}, x::AbstractArray{<:Any,N}) where {T,K,N}
+    n = length(x)
+    length(y) == n || throw(DimensionMismatch("output must match dimension"))
+    _plan_mul!(y, P.plan, x)
+    _cheb2_rescale!(P.plan.region, y)
 end
 
-chebyshevtransform!(x::AbstractVector{T};kind::Integer=1) where {T<:fftwNumber} =
-    plan_chebyshevtransform!(x;kind=kind)*x
+*(P::ChebyshevTransformPlan{T,kind,K,false,N}, x::AbstractArray{T,N}) where {T,kind,K,N} =
+    mul!(similar(x), P, x)
+
+"""
+    chebyshevtransform!(x, kind=Val(1))
 
-chebyshevtransform(x;kind::Integer=1) = chebyshevtransform!(copy(x);kind=kind)
+transforms from values on a Chebyshev grid of the first or second kind to Chebyshev
+coefficients, in-place
+"""
+chebyshevtransform!(x, dims...; kws...) = plan_chebyshevtransform!(x, dims...; kws...)*x
 
-*(P::ChebyshevTransformPlan{T,k,false}, x::AbstractVector{T}) where {T,k} = P.plan*copy(x)
 
+"""
+    chebyshevtransform(x, kind=Val(1))
 
+transforms from values on a Chebyshev grid of the first or second kind to Chebyshev
+coefficients.
+"""
+chebyshevtransform(x, dims...; kws...) = plan_chebyshevtransform(x, dims...; kws...) * x
 
 
 ## Inverse transforms take Chebyshev coefficients and produce values at Chebyshev points of the first and second kinds
 
 
-struct IChebyshevTransformPlan{T,kind,inplace,P}
-    plan::P
+const IFIRSTKIND = FFTW.REDFT01
+
+struct IChebyshevTransformPlan{T,kind,K,inplace,N,R} <: ChebyshevPlan{T}
+    plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}
+    IChebyshevTransformPlan{T,kind,K,inplace,N,R}(plan) where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}(plan)
+    IChebyshevTransformPlan{T,kind,K,inplace,N,R}() where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}()
 end
 
+IChebyshevTransformPlan{T,kind}(F::FFTW.r2rFFTWPlan{T,K,inplace,N,R}) where {T,kind,K,inplace,N,R} =
+    IChebyshevTransformPlan{T,kind,K,inplace,N,R}(F)
+
+
+
 # second kind Chebyshev transforms share a plan with their inverse
 # so we support this via inv
-inv(P::ChebyshevTransformPlan{T,2,true}) where T = IChebyshevTransformPlan{T,2,true,typeof(P)}(P)
-inv(P::IChebyshevTransformPlan{T,2,true}) where T = P.plan
+inv(P::ChebyshevTransformPlan{T,2}) where {T} = IChebyshevTransformPlan{T,2}(P.plan)
+inv(P::IChebyshevTransformPlan{T,2}) where {T} = ChebyshevTransformPlan{T,2}(P.plan)
+
+inv(P::ChebyshevTransformPlan{T,1}) where {T} = IChebyshevTransformPlan{T,1}(inv(P.plan).p)
+inv(P::IChebyshevTransformPlan{T,1}) where {T} = ChebyshevTransformPlan{T,1}(inv(P.plan).p)
+
+
 
 \(P::ChebyshevTransformPlan, x::AbstractArray) = inv(P) * x
 \(P::IChebyshevTransformPlan, x::AbstractArray) = inv(P) * x
 
 
-function plan_ichebyshevtransform!(x::AbstractVector{T};kind::Integer=1) where T<:fftwNumber
-    if kind == 1
-        plan = isempty(x) ? fill(one(T),1,length(x)) : FFTW.plan_r2r!(x, FFTW.REDFT01)
-        IChebyshevTransformPlan{T,1,true,typeof(plan)}(plan)
-    elseif kind == 2
-        inv(plan_chebyshevtransform!(x;kind=2))
+function plan_ichebyshevtransform!(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(x)
+        IChebyshevTransformPlan{T,1,Vector{Int32},true,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
+    else
+        IChebyshevTransformPlan{T,1}(FFTW.plan_r2r!(x, IFIRSTKIND, dims...; kws...))
     end
 end
 
-function plan_ichebyshevtransform(x::AbstractVector{T};kind::Integer=1) where T<:fftwNumber
-    plan = plan_ichebyshevtransform!(similar(Vector{T},axes(x));kind=kind)
-    IChebyshevTransformPlan{T,kind,false,typeof(plan)}(plan)
+function plan_ichebyshevtransform!(x::AbstractArray{T}, ::Val{2}, dims...; kws...) where T<:fftwNumber
+    inv(plan_chebyshevtransform!(x, Val(2), dims...; kws...))
 end
 
-function *(P::IChebyshevTransformPlan{T,1,true},x::AbstractVector{T}) where T<:fftwNumber
-    isempty(x) && return x
-    x[1] *=2
-    x = lmul!(convert(T,0.5), P.plan*x)
+function plan_ichebyshevtransform(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(x)
+        IChebyshevTransformPlan{T,1,Vector{Int32},false,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
+    else
+        IChebyshevTransformPlan{T,1}(FFTW.plan_r2r(x, IFIRSTKIND, dims...; kws...))
+    end
+end
+
+function plan_ichebyshevtransform(x::AbstractArray{T}, ::Val{2}, dims...; kws...) where T<:fftwNumber
+    inv(plan_chebyshevtransform(x, Val(2), dims...; kws...))
+end
+
+@inline function _icheb1_prescale!(d::Number, x::AbstractArray)
+    lmul_dim_begin!(2, d, x)
+    x
+end
+@inline function _icheb1_prescale!(d, x::AbstractArray)
+    for k in d
+        _icheb1_prescale!(k, x)
+    end
+    x
+end
+@inline function _icheb1_postscale!(d::Number, x::AbstractArray)
+    ldiv_dim_begin!(2, d, x)
     x
 end
 
-function *(P::IChebyshevTransformPlan{T,2,true},x::AbstractVector{T}) where T<:fftwNumber
+@inline function _icheb1_postscale!(d, x::AbstractArray)
+    for k in d
+        _icheb1_postscale!(k, x)
+    end
+    x
+end
+
+function *(P::IChebyshevTransformPlan{T,1,K,true,N}, x::AbstractArray{T,N}) where {T<:fftwNumber,K,N}
     n = length(x)
-    n ≤ 1 && return x
-    x[1] *= 2; x[end] *= 2
-    x = P.plan*x
-    x[1] *= 2; x[end] *= 2
-    lmul!(convert(T,0.5(n-1)),x)
+    n == 0 && return x
+
+    _icheb1_prescale!(P.plan.region, x)
+    x = ldiv!(2^length(P.plan.region), P.plan*x)
+    x
 end
 
-ichebyshevtransform!(x::AbstractVector{T};kind::Integer=1) where {T<:fftwNumber} =
-    plan_ichebyshevtransform!(x;kind=kind)*x
+function mul!(y::AbstractArray{T,N}, P::IChebyshevTransformPlan{T,1,K,false,N}, x::AbstractArray{T,N}) where {T<:fftwNumber,K,N}
+    size(y) == size(x) || throw(DimensionMismatch("output must match dimension"))
+    isempty(x) && return y
 
-ichebyshevtransform(x;kind::Integer=1) = ichebyshevtransform!(copy(x); kind=kind)
+    _icheb1_prescale!(P.plan.region, x) # TODO: don't mutate x
+    _plan_mul!(y, P.plan, x)
+    _icheb1_postscale!(P.plan.region, x)
+    ldiv!(2^length(P.plan.region), y)
+end
 
-*(P::IChebyshevTransformPlan{T,k,false},x::AbstractVector{T}) where {T,k} = P.plan*copy(x)
+@inline function _icheb2_prescale!(d::Number, x::AbstractArray)
+    lmul_dim_begin!(2, d, x)
+    lmul_dim_end!(2, d, x)
+    x
+end
+@inline function _icheb2_prescale!(d, x::AbstractArray)
+    for k in d
+        _icheb2_prescale!(k, x)
+    end
+    x
+end
 
-## Code generation for integer inputs
+@inline function _icheb2_postrescale!(d::Number, x::AbstractArray)
+    ldiv_dim_begin!(2, d, x)
+    ldiv_dim_end!(2, d, x)
+    x
+end
+@inline function _icheb2_postrescale!(d, x::AbstractArray)
+    for k in d
+        _icheb2_postrescale!(k, x)
+    end
+    x
+end
+@inline function _icheb2_rescale!(d::Number, y::AbstractArray{T}) where T
+    _icheb2_prescale!(d, y)
+    lmul!(convert(T, size(y,d) - 1)/2, y)
+    y
+end
+@inline function _icheb2_rescale!(d, y::AbstractArray{T}) where T
+    _icheb2_prescale!(d, y)
+    lmul!(_prod_size(convert.(T, size(y) .- 1)./2, d), y)
+    y
+end
+
+function *(P::IChebyshevTransformPlan{T,2,K,true,N}, x::AbstractArray{T,N}) where {T<:fftwNumber,K,N}
+    n = length(x)
 
-for func in (:chebyshevtransform,:ichebyshevtransform)
-    @eval $func(x::AbstractVector{T};kind::Integer=1) where {T<:Integer} = $func(convert(Float64,x);kind=kind)
+    _icheb2_prescale!(P.plan.region, x)
+    x = inv(P)*x
+    _icheb2_rescale!(P.plan.region, x)
 end
 
+function mul!(y::AbstractArray{T,N}, P::IChebyshevTransformPlan{T,2,K,false,N}, x::AbstractArray{<:Any,N}) where {T<:fftwNumber,K,N}
+    n = length(x)
+    length(y) == n || throw(DimensionMismatch("output must match dimension"))
 
-# Matrix inputs
-#
-#
-function chebyshevtransform!(X::AbstractMatrix{T}; kind::Integer=1) where T<:fftwNumber
-    if kind == 1
-        if size(X) == (1,1)
-            X
-        else
-            X=FFTW.r2r!(X,FFTW.REDFT10)
-            X[:,1]/=2;X[1,:]/=2;
-            lmul!(1/(size(X,1)*size(X,2)),X)
-        end
-    elseif kind == 2
-        if size(X) == (1,1)
-            X
-        else
-            X=FFTW.r2r!(X,FFTW.REDFT00)
-            lmul!(1/((size(X,1)-1)*(size(X,2)-1)),X)
-            X[:,1]/=2;X[:,end]/=2
-            X[1,:]/=2;X[end,:]/=2
-            X
+    _icheb2_prescale!(P.plan.region, x)
+    _plan_mul!(y, inv(P), x)
+    _icheb2_postrescale!(P.plan.region, x)
+    _icheb2_rescale!(P.plan.region, y)
+end
+
+*(P::IChebyshevTransformPlan{T,kind,K,false,N}, x::AbstractArray{T,N}) where {T,kind,K,N} =
+    mul!(similar(x), P, _maybemutablecopy(x, T))
+ichebyshevtransform!(x::AbstractArray, dims...; kwds...) = plan_ichebyshevtransform!(x, dims...; kwds...)*x
+ichebyshevtransform(x, dims...; kwds...) = plan_ichebyshevtransform(x, dims...; kwds...)*x
+
+
+#######
+# Chebyshev U
+#######
+
+const UFIRSTKIND = FFTW.RODFT10
+const USECONDKIND = FFTW.RODFT00
+
+struct ChebyshevUTransformPlan{T,kind,K,inplace,N,R} <: ChebyshevPlan{T}
+    plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}
+    ChebyshevUTransformPlan{T,kind,K,inplace,N,R}(plan) where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}(plan)
+    ChebyshevUTransformPlan{T,kind,K,inplace,N,R}() where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}()
+end
+
+ChebyshevUTransformPlan{T,kind}(plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}) where {T,kind,K,inplace,N,R} =
+    ChebyshevUTransformPlan{T,kind,K,inplace,N,R}(plan)
+
+
+function plan_chebyshevutransform!(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(x)
+        ChebyshevUTransformPlan{T,1,Vector{Int32},true,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
+    else
+        ChebyshevUTransformPlan{T,1}(FFTW.plan_r2r!(x, UFIRSTKIND, dims...; kws...))
+    end
+end
+function plan_chebyshevutransform!(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
+    any(≤(1),size(x)) && throw(ArgumentError("Array must contain at least 2 entries"))
+    ChebyshevUTransformPlan{T,2}(FFTW.plan_r2r!(x, USECONDKIND, dims...; kws...))
+end
+
+function plan_chebyshevutransform(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(x)
+        ChebyshevUTransformPlan{T,1,Vector{Int32},false,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
+    else
+        ChebyshevUTransformPlan{T,1}(FFTW.plan_r2r(x, UFIRSTKIND, dims...; kws...))
+    end
+end
+function plan_chebyshevutransform(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(dims)
+        any(≤(1), size(x)) && throw(ArgumentError("Array must contain at least 2 entries"))
+    else
+        for d in dims[1]
+            size(x,d) ≤ 1 && throw(ArgumentError("Array must contain at least 2 entries"))
         end
     end
+    ChebyshevUTransformPlan{T,2}(FFTW.plan_r2r(x, USECONDKIND, dims...; kws...))
 end
-#
-function ichebyshevtransform!(X::AbstractMatrix{T}; kind::Integer=1) where T<:fftwNumber
-    if kind == 1
-        if size(X) == (1,1)
+
+for f in [:_chebu1_prescale!, :_chebu1_postscale!, :_chebu2_prescale!, :_chebu2_postscale!,
+            :_ichebu1_postscale!]
+    _f = Symbol(:_, f)
+    @eval begin
+        @inline function $f(d::Number, X::AbstractArray) # validate the dimension, then delegate to the double-underscore worker
+            d ∈ 1:ndims(X) || throw(ArgumentError("dimension $d must lie between 1 and $(ndims(X))")) # typed exception instead of a bare String
+            $_f(d, X)
             X
-        else
-            X[1,:]*=2;X[:,1]*=2
-            X = FFTW.r2r(X,FFTW.REDFT01)
-            lmul!(0.25, X)
         end
-    elseif kind == 2
-        if size(X) == (1,1)
-            X
-        else
-            X[1,:]*=2;X[end,:]*=2;X[:,1]*=2;X[:,end]*=2
-            X=chebyshevtransform!(X;kind=kind)
-            X[1,:]*=2;X[end,:]*=2;X[:,1]*=2;X[:,end]*=2
-            lmul!((size(X,1)-1)*(size(X,2)-1)/4,X)
+        @inline function $f(d, y::AbstractArray)
+            for k in d
+                $f(k, y)
+            end
+            y
         end
     end
 end
-#
-
 
-## Chebyshev U
-
-struct ChebyshevUTransformPlan{T,kind,inplace,P} <: Plan{T}
-    plan::P
+function __chebu1_prescale!(d::Number, X::AbstractArray{T}) where {T}
+    m = size(X,d)
+    r = one(T)/(2m) .+ ((1:m) .- one(T))./m
+    applydim!(v -> v .*= sinpi.(r) ./ m, X, d, :)
 end
 
-ChebyshevUTransformPlan{k,inp}(plan) where {k,inp} =
-    ChebyshevUTransformPlan{eltype(plan),k,inp,typeof(plan)}(plan)
-
+@inline function __chebu1_postscale!(d::Number, X::AbstractArray{T}) where {T}
+    m = size(X,d)
+    r = one(T)/(2m) .+ ((1:m) .- one(T))./m
+    applydim!(v -> v ./= sinpi.(r) ./ m, X, d, :)
+end
 
+function *(P::ChebyshevUTransformPlan{T,1,K,true,N}, x::AbstractArray{T,N}) where {T,K,N}
+    length(x) ≤ 1 && return x
+    _chebu1_prescale!(P.plan.region, x)
+    P.plan * x
+end
 
-function plan_chebyshevutransform!(x::AbstractVector{T}; kind::Integer=1) where T<:fftwNumber
-    if kind == 1
-        plan = isempty(x) ? fill(one(T),1,length(x)) : FFTW.plan_r2r!(x, FFTW.RODFT10)
-        ChebyshevUTransformPlan{1,true}(plan)
-    elseif kind == 2
-        plan = length(x) ≤ 1 ? fill(one(T),1,length(x)) : FFTW.plan_r2r!(x, FFTW.RODFT00)
-        ChebyshevUTransformPlan{2,true}(plan)
+function mul!(y::AbstractArray{T}, P::ChebyshevUTransformPlan{T,1,K,false}, x::AbstractArray{T}) where {T,K}
+    size(y) == size(x) || throw(DimensionMismatch("output must match dimension"))
+    isempty(x) && return y
+    _chebu1_prescale!(P.plan.region, x) # Todo don't mutate x
+    _plan_mul!(y, P.plan, x)
+    _chebu1_postscale!(P.plan.region, x)
+    for d in P.plan.region
+        size(y,d) == 1 && ldiv!(2, y) # fix doubling
     end
+    y
 end
 
-function plan_chebyshevutransform(x::AbstractVector{T};kind::Integer=1) where T<:fftwNumber
-    plan = plan_chebyshevutransform!(x;kind=kind)
-    ChebyshevUTransformPlan{kind,false}(plan)
+
+@inline function __chebu2_prescale!(d, X::AbstractArray{T}) where {T}
+    m = size(X,d)
+    c = one(T)/ (m+1)
+    r = (1:m) .* c
+    applydim!(v -> v .*= sinpi.(r), X, d, :)
 end
 
-function *(P::ChebyshevUTransformPlan{T,1,true},x::AbstractVector{T}) where T
-    n = length(x)
-    n ≤ 1 && return x
+@inline function __chebu2_postscale!(d::Number, X::AbstractArray{T}) where {T}
+    m = size(X,d)
+    c = one(T)/ (m+1)
+    r = (1:m) .* c
+    applydim!(v -> v ./= sinpi.(r), X, d, :)
+end
 
-    for k=1:n # sqrt(1-x_j^2) weight
-        x[k] *= sinpi(one(T)/(2n) + (k-one(T))/n)/n
+function *(P::ChebyshevUTransformPlan{T,2,K,true,N}, x::AbstractArray{T,N}) where {T,K,N}
+    sc = one(T)
+    for d in P.plan.region
+        sc *= one(T)/(size(x,d)+1)
     end
-    P.plan * x
+    _chebu2_prescale!(P.plan.region, x)
+    lmul!(sc, P.plan * x)
 end
 
-function *(P::ChebyshevUTransformPlan{T,2,true},x::AbstractVector{T}) where T
-    n = length(x)
-    n ≤ 1 && return x
-
-    c = one(T)/ (n+1)
-    for k=1:n # sqrt(1-x_j^2) weight
-        x[k] *= sinpi(k*c)
+function mul!(y::AbstractArray{T}, P::ChebyshevUTransformPlan{T,2,K,false}, x::AbstractArray{T}) where {T,K}
+    sc = one(T)
+    for d in P.plan.region
+        sc *= one(T)/(size(x,d)+1)
     end
-    lmul!(c, P.plan * x)
+    _chebu2_prescale!(P.plan.region, x) # TODO don't mutate x
+    _plan_mul!(y, P.plan, x)
+    _chebu2_postscale!(P.plan.region, x)
+    lmul!(sc, y)
 end
 
-chebyshevutransform!(x::AbstractVector{T};kind::Integer=1) where {T<:fftwNumber} =
-    plan_chebyshevutransform!(x;kind=kind)*x
+*(P::ChebyshevUTransformPlan{T,kind,K,false,N}, x::AbstractArray{T,N}) where {T,kind,K,N} =
+    mul!(similar(x), P, x)
 
-chebyshevutransform(x;kind::Integer=1) = chebyshevutransform!(copy(x);kind=kind)
+chebyshevutransform!(x::AbstractArray{T}, dims...; kws...) where {T<:fftwNumber} =
+    plan_chebyshevutransform!(x, dims...; kws...)*x
 
-*(P::ChebyshevUTransformPlan{T,k,false},x::AbstractVector{T}) where {T,k} = P.plan*copy(x)
 
-## Inverse transforms take ChebyshevU coefficients and produce values at ChebyshevU points of the first and second kinds
+"""
+    chebyshevutransform(x, ::Val{kind}=Val(1))
+
+transforms from values on a Chebyshev grid of the first or second kind to Chebyshev
+coefficients of the 2nd kind (Chebyshev U expansion).
+"""
+chebyshevutransform(x, dims...; kws...) = plan_chebyshevutransform(x, dims...; kws...)*x
 
 
-struct IChebyshevUTransformPlan{T,kind,inplace,P}
-    plan::P
+## Inverse transforms take ChebyshevU coefficients and produce values at ChebyshevU points of the first and second kinds
+const IUFIRSTKIND = FFTW.RODFT01
+
+struct IChebyshevUTransformPlan{T,kind,K,inplace,N,R} <: ChebyshevPlan{T}
+    plan::FFTW.r2rFFTWPlan{T,K,inplace,N,R}
+    IChebyshevUTransformPlan{T,kind,K,inplace,N,R}(plan) where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}(plan)
+    IChebyshevUTransformPlan{T,kind,K,inplace,N,R}() where {T,kind,K,inplace,N,R} = new{T,kind,K,inplace,N,R}()
 end
 
+IChebyshevUTransformPlan{T,kind}(F::FFTW.r2rFFTWPlan{T,K,inplace,N,R}) where {T,kind,K,inplace,N,R} =
+    IChebyshevUTransformPlan{T,kind,K,inplace,N,R}(F)
 
-function plan_ichebyshevutransform!(x::AbstractVector{T};kind::Integer=1) where T<:fftwNumber
-    if kind == 1
-        plan = isempty(x) ? fill(one(T),1,length(x)) : FFTW.plan_r2r!(x, FFTW.RODFT01)
-        IChebyshevUTransformPlan{T,1,true,typeof(plan)}(plan)
-    elseif kind == 2
-        plan = length(x) ≤ 1 ? fill(one(T),1,length(x)) : FFTW.plan_r2r!(x, FFTW.RODFT00)
-        IChebyshevUTransformPlan{T,2,true,typeof(plan)}(plan)
+function plan_ichebyshevutransform!(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(x)
+        IChebyshevUTransformPlan{T,1,Vector{Int32},true,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
+    else
+        IChebyshevUTransformPlan{T,1}(FFTW.plan_r2r!(x, IUFIRSTKIND, dims...; kws...))
     end
 end
+function plan_ichebyshevutransform!(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N} # in-place planner, second-kind inverse U transform
+    any(≤(1),size(x)) && throw(ArgumentError("Array must contain at least 2 entries")) # every axis of x must have length ≥ 2
+    IChebyshevUTransformPlan{T,2}(FFTW.plan_r2r!(x, USECONDKIND, dims...; kws...)) # forward the planner keywords instead of silently dropping them
+end
 
-function plan_ichebyshevutransform(x::AbstractVector{T}; kind::Integer=1) where T<:fftwNumber
-    plan = plan_ichebyshevutransform!(similar(Vector{T},axes(x)); kind=kind)
-    IChebyshevUTransformPlan{T,kind,false,typeof(plan)}(plan)
+function plan_ichebyshevutransform(x::AbstractArray{T,N}, ::Val{1}, dims...; kws...) where {T<:fftwNumber,N}
+    if isempty(x)
+        IChebyshevUTransformPlan{T,1,Vector{Int32},false,N,isempty(dims) ? NTuple{N,Int} : typeof(dims[1])}()
+    else
+        IChebyshevUTransformPlan{T,1}(FFTW.plan_r2r(x, IUFIRSTKIND, dims...; kws...))
+    end
+end
+function plan_ichebyshevutransform(x::AbstractArray{T,N}, ::Val{2}, dims...; kws...) where {T<:fftwNumber,N}
+    any(≤(1),size(x)) && throw(ArgumentError("Array must contain at least 2 entries"))
+    IChebyshevUTransformPlan{T,2}(FFTW.plan_r2r(x, USECONDKIND, dims...; kws...))
 end
 
-function *(P::IChebyshevUTransformPlan{T,1,true}, x::AbstractVector{T}) where T<:fftwNumber
-    n = length(x)
-    n ≤ 1 && return x
 
+# second kind Chebyshev transforms share a plan with their inverse
+# so we support this via inv
+inv(P::ChebyshevUTransformPlan{T,2}) where {T} = IChebyshevUTransformPlan{T,2}(P.plan)
+inv(P::IChebyshevUTransformPlan{T,2}) where {T} = ChebyshevUTransformPlan{T,2}(P.plan)
+
+inv(P::ChebyshevUTransformPlan{T,1}) where {T} = IChebyshevUTransformPlan{T,1}(inv(P.plan).p)
+inv(P::IChebyshevUTransformPlan{T,1}) where {T} = ChebyshevUTransformPlan{T,1}(inv(P.plan).p)
+
+@inline function __ichebu1_postscale!(d::Number, X::AbstractArray{T}) where {T}
+    m = size(X,d)
+    r = one(T)/(2m) .+ ((1:m) .- one(T))/m
+    applydim!(v -> v ./= 2 .* sinpi.(r), X, d, :)
+end
+
+function *(P::IChebyshevUTransformPlan{T,1,K,true}, x::AbstractArray{T}) where {T<:fftwNumber,K}
+    length(x) ≤ 1 && return x
     x = P.plan * x
-    for k=1:n # sqrt(1-x_j^2) weight
-        x[k] /= 2sinpi(one(T)/(2n) + (k-one(T))/n)
+    _ichebu1_postscale!(P.plan.region, x)
+end
+
+function mul!(y::AbstractArray{T}, P::IChebyshevUTransformPlan{T,1,K,false}, x::AbstractArray{T}) where {T<:fftwNumber,K}
+    size(y) == size(x) || throw(DimensionMismatch("output must match dimension"))
+    isempty(x) && return y
+    _plan_mul!(y, P.plan, x)
+    _ichebu1_postscale!(P.plan.region, y)
+    for d in P.plan.region
+        size(y,d) == 1 && lmul!(2, y) # fix doubling
     end
-    x
+    y
 end
 
+function _ichebu2_rescale!(d::Number, x::AbstractArray{T}) where T
+    _chebu2_postscale!(d, x)
+    ldiv!(2, x)
+    x
+end
 
+@inline function _ichebu2_rescale!(d, y::AbstractArray)
+    for k in d
+        _ichebu2_rescale!(k, y)
+    end
+    y
+end
 
-function *(P::IChebyshevUTransformPlan{T,2,true}, x::AbstractVector{T}) where T<:fftwNumber
+function *(P::IChebyshevUTransformPlan{T,2,K,true}, x::AbstractArray{T}) where {T<:fftwNumber,K}
     n = length(x)
-    n ≤ 1 && return x
+    n ≤ 1 && return x
 
-    c = one(T)/ (n+1)
-    lmul!((n+1)/(2n+2*one(T)), x)
     x = P.plan * x
-    for k=1:n # sqrt(1-x_j^2) weight
-        x[k] /= sinpi(k*c)
-    end
-    x
+    _ichebu2_rescale!(P.plan.region, x)
+end
+
+function mul!(y::AbstractArray{T}, P::IChebyshevUTransformPlan{T,2,K,false}, x::AbstractArray{T}) where {T<:fftwNumber,K} # out-of-place: writes into y, returns y
+    size(y) == size(x) || throw(DimensionMismatch("output must match dimension"))
+    length(x) ≤ 1 && return copyto!(y, x) # in-place method leaves such x untouched; here fill and return the output, not the input
+
+    _plan_mul!(y, P.plan, x)
+    _ichebu2_rescale!(P.plan.region, y) # rescale along every transformed dimension; returns y
 end
 
-ichebyshevutransform!(x::AbstractVector{T};kind::Integer=1) where {T<:fftwNumber} =
-    plan_ichebyshevutransform!(x;kind=kind)*x
+ichebyshevutransform!(x::AbstractArray{T}, dims...; kwds...) where {T<:fftwNumber} =
+    plan_ichebyshevutransform!(x, dims...; kwds...)*x
+
+ichebyshevutransform(x, dims...; kwds...) = plan_ichebyshevutransform(x, dims...; kwds...)*x
 
-ichebyshevutransform(x;kind::Integer=1) = ichebyshevutransform!(copy(x);kind=kind)
+*(P::IChebyshevUTransformPlan{T,k,K,false,N}, x::AbstractArray{T,N}) where {T,k,K,N} =
+    mul!(similar(x), P, x)
 
-*(P::IChebyshevUTransformPlan{T,k,false},x::AbstractVector{T}) where {T,k} = P.plan*copy(x)
 
 ## Code generation for integer inputs
 
-for func in (:chebyshevutransform,:ichebyshevutransform)
-    @eval $func(x::AbstractVector{T};kind::Integer=1) where {T<:Integer} = $func(convert(Float64,x);kind=kind)
+for func in (:chebyshevtransform,:ichebyshevtransform,:chebyshevutransform,:ichebyshevutransform)
+    @eval $func(x::AbstractVector{T}, dims...; kwds...) where {T<:Integer} = $func(convert(AbstractVector{float(T)},x), dims...; kwds...)
 end
 
 
 
-
 ## points
 
-function chebyshevpoints(::Type{T}, n::Integer; kind::Int=1) where T<:Number
-    if kind == 1
-        T[sinpi((n-2k-one(T))/2n) for k=0:n-1]
-    elseif kind == 2
-        if n == 1
-            zeros(T,1)
-        else
-	    T[sinpi((n-2k-one(T))/(2n-2)) for k=0:n-1]
-        end
-    else
-        throw(ArgumentError("kind $kind not a valid kind of Chebyshev points"))
+struct ChebyshevGrid{kind,T} <: AbstractVector{T} # lazy vector of Chebyshev points; entries are computed in getindex
+    n::Int # number of grid points (the vector's length)
+    function ChebyshevGrid{1,T}(n::Int) where T # first-kind grid: any n ≥ 0 is accepted
+        n ≥ 0 || throw(ArgumentError("Number of points must be nonnegative"))
+        new{1,T}(n)
+    end
+    function ChebyshevGrid{2,T}(n::Int) where T # second-kind grid: getindex divides by 2n-2, so n ≥ 2 is required
+        n ≥ 2 || throw(ArgumentError("Number of points must be at least 2"))
+        new{2,T}(n)
     end
 end
-chebyshevpoints(n::Integer; kind::Int=1) = chebyshevpoints(Float64, n; kind=kind)
+
+ChebyshevGrid{kind}(n::Integer) where kind = ChebyshevGrid{kind,Float64}(n)
+
+size(g::ChebyshevGrid) = (g.n,)
+getindex(g::ChebyshevGrid{1,T}, k::Integer) where T =
+    sinpi(convert(T,g.n-2k+1)/(2g.n))
+
+getindex(g::ChebyshevGrid{2,T}, k::Integer) where T =
+    sinpi(convert(T,g.n-2k+1)/(2g.n-2))
+
+chebyshevpoints(::Type{T}, n::Integer, ::Val{kind}) where {T<:Number,kind} = ChebyshevGrid{kind,T}(n)
+chebyshevpoints(::Type{T}, n::Integer) where T = chebyshevpoints(T, n, Val(1))
+chebyshevpoints(n::Integer, kind=Val(1)) = chebyshevpoints(Float64, n, kind)
 
 
 # sin(nθ) coefficients to values at Clenshaw-Curtis nodes except ±1
@@ -315,3 +633,98 @@ chebyshevpoints(n::Integer; kind::Int=1) = chebyshevpoints(Float64, n; kind=kind
 #     x = P.plan*x
 #     rmul!(x,half(T))
 # end
+
+
+###
+# BigFloat
+# Use `Nothing` and fall back to FFT
+###
+
+
+plan_chebyshevtransform(x::AbstractArray{T,N}, ::Val{kind}, dims...; kws...) where {T,N,kind} =
+    ChebyshevTransformPlan{T,kind,Nothing,false,N,UnitRange{Int}}()
+plan_ichebyshevtransform(x::AbstractArray{T,N}, ::Val{kind}, dims...; kws...) where {T,N,kind} =
+    IChebyshevTransformPlan{T,kind,Nothing,false,N,UnitRange{Int}}()
+
+plan_chebyshevtransform!(x::AbstractArray{T,N}, ::Val{kind}, dims...; kws...) where {T,N,kind} =
+    ChebyshevTransformPlan{T,kind,Nothing,true,N,UnitRange{Int}}()
+plan_ichebyshevtransform!(x::AbstractArray{T,N}, ::Val{kind}, dims...; kws...) where {T,N,kind} =
+    IChebyshevTransformPlan{T,kind,Nothing,true,N,UnitRange{Int}}()
+
+
+#following Chebfun's @Chebtech1/vals2coeffs.m and @Chebtech2/vals2coeffs.m
+function *(P::ChebyshevTransformPlan{T,1,Nothing,false}, x::AbstractVector{T}) where T # FFT-based fallback for plans without an FFTW plan
+    n = length(x)
+    if n ≤ 1 # empty and single-entry inputs are returned as-is (the FFT branch would index ret[1] on an empty vector)
+        x
+    else
+        w = [2exp(im*convert(T,π)*k/2n) for k=0:n-1] # complex phase weights applied to the first n FFT outputs
+        ret = w.*ifft([x;reverse(x)])[1:n] # transform the even extension [x; reverse(x)] and keep the first n entries
+        ret = T<:Real ? real(ret) : ret
+        ret[1] /= 2
+        ret
+    end
+end
+
+
+# function *(P::ChebyshevTransformPlan{T,1,K,Nothing,false}, x::AbstractVector{T}) where {T,K}
+#     n = length(x)
+#     if n == 1
+#         x
+#     else
+#         ret = ifft([x;x[end:-1:2]])[1:n]
+#         ret = T<:Real ? real(ret) : ret
+#         ret[2:n-1] *= 2
+#         ret
+#     end
+# end
+
+
+*(P::ChebyshevTransformPlan{T,1,Nothing,true,N,R}, x::AbstractVector{T}) where {T,N,R} =
+    copyto!(x, ChebyshevTransformPlan{T,1,Nothing,false,N,R}() * x)
+# *(P::ChebyshevTransformPlan{T,2,true,Nothing}, x::AbstractVector{T}) where T =
+#     copyto!(x, ChebyshevTransformPlan{T,2,false,Nothing}() * x)
+
+
+# following Chebfun's @Chebtech1/coeffs2vals.m (inverse transform: coefficients to values)
+function *(P::IChebyshevTransformPlan{T,1,Nothing,false}, x::AbstractVector{T}) where T
+    n = length(x)
+    if n ≤ 1 # empty and single-entry inputs are returned as-is (the FFT branch would index w[1] on an empty vector)
+        x
+    else
+        w = [exp(-im*convert(T,π)*k/2n)/2 for k=0:2n-1] # phase weights for the length-2n extended sequence
+        w[1] *= 2;w[n+1] *= 0;w[n+2:end] *= -1 # the zero weight at n+1 cancels the one(T) placeholder inserted below
+        ret = fft(w.*[x;one(T);x[end:-1:2]])
+        ret = T<:Real ? real(ret) : ret
+        ret[1:n]
+    end
+end
+
+# function *(P::IChebyshevTransformPlan{T,2,K,Nothing,true}, x::AbstractVector{T}) where {T,K}
+#     n = length(x)
+#     if n == 1
+#         x
+#     else
+#         x[1] *= 2; x[end] *= 2
+#         chebyshevtransform!(x, Val(2))
+#         x[1] *= 2; x[end] *= 2
+#         lmul!(convert(T,n-1)/2, x)
+#         x
+#     end
+# end
+
+*(P::IChebyshevTransformPlan{T,1,Nothing,true,N,R}, x::AbstractVector{T}) where {T,N,R} =
+    copyto!(x, IChebyshevTransformPlan{T,1,Nothing,false,N,R}() * x)
+# *(P::IChebyshevTransformPlan{T,SECONDKIND,false,Nothing}, x::AbstractVector{T}) where T =
+#     IChebyshevTransformPlan{T,SECONDKIND,true,Nothing}() * copy(x)
+
+
+for pln in (:plan_chebyshevtransform!, :plan_chebyshevtransform, 
+            :plan_chebyshevutransform!, :plan_chebyshevutransform, 
+            :plan_ichebyshevutransform, :plan_ichebyshevutransform!, 
+            :plan_ichebyshevtransform, :plan_ichebyshevtransform!)
+    @eval begin
+        $pln(x::AbstractArray, dims...; kws...) = $pln(x, Val(1), dims...; kws...)
+        $pln(::Type{T}, szs, dims...; kwds...) where T = $pln(Array{T}(undef, szs...), dims...; kwds...)
+    end
+end
diff --git a/src/clenshawcurtis.jl b/src/clenshawcurtis.jl
index 05f4d85a..535f9139 100644
--- a/src/clenshawcurtis.jl
+++ b/src/clenshawcurtis.jl
@@ -3,7 +3,7 @@ plan_clenshawcurtis(μ) = length(μ) > 1 ? FFTW.plan_r2r!(μ, FFTW.REDFT00) : fi
 """
 Compute nodes of the Clenshaw—Curtis quadrature rule.
 """
-clenshawcurtisnodes(::Type{T}, N::Int) where T = chebyshevpoints(T, N; kind = 2)
+clenshawcurtisnodes(::Type{T}, N::Int) where T = chebyshevpoints(T, N, Val(2))
 
 """
 Compute weights of the Clenshaw—Curtis quadrature rule with modified Chebyshev moments of the first kind ``\\mu``.
diff --git a/src/docstrings.jl b/src/docstrings.jl
new file mode 100644
index 00000000..c3ecd7a3
--- /dev/null
+++ b/src/docstrings.jl
@@ -0,0 +1,121 @@
+"""
+	leg2cheb(v::AbstractVector; normleg::Bool=false, normcheb::Bool=false)
+
+Convert the vector of expansion coefficients `v` from a Legendre to a Chebyshev basis.
+The keyword arguments denote whether the bases are normalized.
+"""
+leg2cheb
+
+"""
+	cheb2leg(v::AbstractVector; normcheb::Bool=false, normleg::Bool=false)
+
+Convert the vector of expansion coefficients `v` from a Chebyshev to a Legendre basis.
+The keyword arguments denote whether the bases are normalized.
+"""
+cheb2leg
+
+"""
+	ultra2ultra(v::AbstractVector, λ, μ; norm1::Bool=false, norm2::Bool=false)
+
+Convert the vector of expansion coefficients `v` from an Ultraspherical basis of
+order `λ` to an Ultraspherical basis of order `μ`.
+The keyword arguments denote whether the bases are normalized.
+"""
+ultra2ultra
+
+"""
+	jac2jac(v::AbstractVector, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)
+
+Convert the vector of expansion coefficients `v` from a Jacobi basis of
+order `(α,β)` to a Jacobi basis of order `(γ,δ)`.
+The keyword arguments denote whether the bases are normalized.
+"""
+jac2jac
+
+"""
+	lag2lag(v::AbstractVector, α, β; norm1::Bool=false, norm2::Bool=false)
+
+Convert the vector of expansion coefficients `v` from a Laguerre basis of
+order `α` to a Laguerre basis of order `β`.
+The keyword arguments denote whether the bases are normalized."""
+lag2lag
+
+"""
+	jac2ultra(v::AbstractVector, α, β, λ; normjac::Bool=false, normultra::Bool=false)
+
+Convert the vector of expansion coefficients `v` from a Jacobi basis of
+order `(α,β)` to an Ultraspherical basis of order `λ`.
+The keyword arguments denote whether the bases are normalized."""
+jac2ultra
+
+"""
+	ultra2jac(v::AbstractVector, λ, α, β; normultra::Bool=false, normjac::Bool=false)
+
+Convert the vector of expansion coefficients `v` from an Ultraspherical basis of
+order `λ` to a Jacobi basis of order `(α,β)`.
+The keyword arguments denote whether the bases are normalized.
+"""
+ultra2jac
+
+"""
+	jac2cheb(v::AbstractVector, α, β; normjac::Bool=false, normcheb::Bool=false)
+
+Convert the vector of expansion coefficients `v` from a Jacobi basis of
+order `(α,β)` to a Chebyshev basis.
+The keyword arguments denote whether the bases are normalized.
+"""
+jac2cheb
+
+"""
+	cheb2jac(v::AbstractVector, α, β; normcheb::Bool=false, normjac::Bool=false)
+
+Convert the vector of expansion coefficients `v` from a Chebyshev basis to a
+Jacobi basis of order `(α,β)`.
+The keyword arguments denote whether the bases are normalized.
+"""
+cheb2jac
+
+"""
+	ultra2cheb(v::AbstractVector, λ; normultra::Bool=false, normcheb::Bool=false)
+
+Convert the vector of expansion coefficients `v` from an Ultraspherical basis of
+order `λ` to a Chebyshev basis.
+The keyword arguments denote whether the bases are normalized.
+"""
+ultra2cheb
+
+"""
+	cheb2ultra(v::AbstractVector, λ; normcheb::Bool=false, normultra::Bool=false)
+
+Convert the vector of expansion coefficients `v` from a Chebyshev basis
+to an Ultraspherical basis of order `λ`.
+The keyword arguments denote whether the bases are normalized.
+"""
+cheb2ultra
+
+"""
+	associatedjac2jac(v::AbstractVector, c::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)
+
+Convert the vector of expansion coefficients `v` from an associated Jacobi basis
+of orders `(α,β)` to a Jacobi basis of order `(γ,δ)`.
+The keyword arguments denote whether the bases are normalized.
+"""
+associatedjac2jac
+
+"""
+	modifiedjac2jac(v::AbstractVector{T}, α, β, u::Vector{T}; verbose::Bool=false) where {T}
+	modifiedjac2jac(v::AbstractVector{T}, α, β, u::Vector{T}, v::Vector{T}; verbose::Bool=false) where {T}
+"""
+modifiedjac2jac
+
+"""
+	modifiedlag2lag(v::AbstractVector{T}, α, u::Vector{T}; verbose::Bool=false)
+	modifiedlag2lag(v::AbstractVector{T}, α, u::Vector{T}, v::Vector{T}; verbose::Bool=false) where {T}
+"""
+modifiedlag2lag
+
+"""
+	modifiedherm2herm(v::AbstractVector{T}, u::Vector{T}; verbose::Bool=false)
+	modifiedherm2herm(v::AbstractVector{T}, u::Vector{T}, v::Vector{T}; verbose::Bool=false) where {T}
+"""
+modifiedherm2herm
diff --git a/src/elliptic.jl b/src/elliptic.jl
new file mode 100644
index 00000000..ccf52f10
--- /dev/null
+++ b/src/elliptic.jl
@@ -0,0 +1,121 @@
+"""
+`FastTransforms` submodule for the computation of some elliptic integrals and functions.
+
+Complete elliptic integrals of the first and second kinds:
+```math
+K(k) = \\int_0^{\\frac{\\pi}{2}} \\frac{{\\rm d}\\theta}{\\sqrt{1-k^2\\sin^2\\theta}},\\quad{\\rm and},
+```
+```math
+E(k) = \\int_0^{\\frac{\\pi}{2}} \\sqrt{1-k^2\\sin^2\\theta} {\\rm\\,d}\\theta.
+```
+
+Jacobian elliptic functions:
+```math
+x = \\int_0^{\\operatorname{sn}(x,k)} \\frac{{\\rm d}t}{\\sqrt{(1-t^2)(1-k^2t^2)}},
+```
+```math
+x = \\int_{\\operatorname{cn}(x,k)}^1 \\frac{{\\rm d}t}{\\sqrt{(1-t^2)[1-k^2(1-t^2)]}},
+```
+```math
+x = \\int_{\\operatorname{dn}(x,k)}^1 \\frac{{\\rm d}t}{\\sqrt{(1-t^2)(t^2-1+k^2)}},
+```
+and the remaining nine are defined by:
+```math
+\\operatorname{pq}(x,k) = \\frac{\\operatorname{pr}(x,k)}{\\operatorname{qr}(x,k)} = \\frac{1}{\\operatorname{qp}(x,k)}.
+```
+"""
+module Elliptic
+
+import FastTransforms: libfasttransforms
+
+export K, E,
+       sn, cn, dn, ns, nc, nd,
+       sc, cs, sd, ds, cd, dc
+
+for (fC, elty) in ((:ft_complete_elliptic_integralf, :Float32), (:ft_complete_elliptic_integral, :Float64))
+    @eval begin
+        function K(k::$elty)
+            return ccall(($(string(fC)), libfasttransforms), $elty, (Cint, $elty), '1', k)
+        end
+        function E(k::$elty)
+            return ccall(($(string(fC)), libfasttransforms), $elty, (Cint, $elty), '2', k)
+        end
+    end
+end
+
+const SN = UInt(1)
+const CN = UInt(2)
+const DN = UInt(4)
+
+for (fC, elty) in ((:ft_jacobian_elliptic_functionsf, :Float32), (:ft_jacobian_elliptic_functions, :Float64)) # one set of wrappers per precision
+    @eval begin # the last ccall argument looks like a bitmask of requested outputs: OR together SN/CN/DN for every non-NULL pointer
+        function sn(x::$elty, k::$elty)
+            retsn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, retsn, C_NULL, C_NULL, SN)
+            retsn[]
+        end
+        function cn(x::$elty, k::$elty)
+            retcn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, C_NULL, retcn, C_NULL, CN)
+            retcn[]
+        end
+        function dn(x::$elty, k::$elty)
+            retdn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, C_NULL, C_NULL, retdn, DN)
+            retdn[]
+        end
+        function ns(x::$elty, k::$elty) # ns = 1/sn
+            retsn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, retsn, C_NULL, C_NULL, SN)
+            inv(retsn[])
+        end
+        function nc(x::$elty, k::$elty) # nc = 1/cn
+            retcn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, C_NULL, retcn, C_NULL, CN)
+            inv(retcn[])
+        end
+        function nd(x::$elty, k::$elty) # nd = 1/dn
+            retdn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, C_NULL, C_NULL, retdn, DN)
+            inv(retdn[])
+        end
+        function sc(x::$elty, k::$elty) # sc = sn/cn; SN | CN requests both outputs (SN & CN is 0)
+            retsn = Ref{$elty}()
+            retcn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, retsn, retcn, C_NULL, SN | CN)
+            retsn[]/retcn[]
+        end
+        function cs(x::$elty, k::$elty) # cs = cn/sn
+            retsn = Ref{$elty}()
+            retcn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, retsn, retcn, C_NULL, SN | CN)
+            retcn[]/retsn[]
+        end
+        function sd(x::$elty, k::$elty) # sd = sn/dn
+            retsn = Ref{$elty}()
+            retdn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, retsn, C_NULL, retdn, SN | DN)
+            retsn[]/retdn[]
+        end
+        function ds(x::$elty, k::$elty) # ds = dn/sn
+            retsn = Ref{$elty}()
+            retdn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, retsn, C_NULL, retdn, SN | DN)
+            retdn[]/retsn[]
+        end
+        function cd(x::$elty, k::$elty) # cd = cn/dn
+            retcn = Ref{$elty}()
+            retdn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, C_NULL, retcn, retdn, CN | DN)
+            retcn[]/retdn[]
+        end
+        function dc(x::$elty, k::$elty) # dc = dn/cn
+            retcn = Ref{$elty}()
+            retdn = Ref{$elty}()
+            ccall(($(string(fC)), libfasttransforms), Cvoid, ($elty, $elty, Ptr{$elty}, Ptr{$elty}, Ptr{$elty}, UInt), x, k, C_NULL, retcn, retdn, CN | DN)
+            retdn[]/retcn[]
+        end
+    end
+end
+
+end # module
diff --git a/src/fejer.jl b/src/fejer.jl
index e51ef548..096b5592 100644
--- a/src/fejer.jl
+++ b/src/fejer.jl
@@ -3,7 +3,7 @@ plan_fejer1(μ) = FFTW.plan_r2r!(μ, FFTW.REDFT01)
 """
 Compute nodes of Fejer's first quadrature rule.
 """
-fejernodes1(::Type{T}, N::Int) where T = chebyshevpoints(T, N; kind = 1)
+fejernodes1(::Type{T}, N::Int) where T = chebyshevpoints(T, N, Val(1))
 
 """
 Compute weights of Fejer's first quadrature rule with modified Chebyshev moments of the first kind ``\\mu``.
diff --git a/src/fftBigFloat.jl b/src/fftBigFloat.jl
deleted file mode 100644
index e43cf056..00000000
--- a/src/fftBigFloat.jl
+++ /dev/null
@@ -1,278 +0,0 @@
-const AbstractFloats = Union{AbstractFloat,Complex{T} where T<:AbstractFloat}
-
-# We use these type definitions for clarity
-const RealFloats = T where T<:AbstractFloat
-const ComplexFloats = Complex{T} where T<:AbstractFloat
-
-
-# The following implements Bluestein's algorithm, following http://www.dsprelated.com/dspbooks/mdft/Bluestein_s_FFT_Algorithm.html
-# To add more types, add them in the union of the function's signature.
-
-function generic_fft(x::Vector{T}) where T<:AbstractFloats
-    T <: FFTW.fftwNumber && (@warn("Using generic fft for FFTW number type."))
-    n = length(x)
-    ispow2(n) && return generic_fft_pow2(x)
-    ks = range(zero(real(T)),stop=n-one(real(T)),length=n)
-    Wks = exp.((-im).*convert(T,π).*ks.^2 ./ n)
-    xq, wq = x.*Wks, conj([exp(-im*convert(T,π)*n);reverse(Wks);Wks[2:end]])
-    return Wks.*conv(xq,wq)[n+1:2n]
-end
-
-
-function generic_fft!(x::Vector{T}) where T<:AbstractFloats
-    x[:] = generic_fft(x)
-    return x
-end
-
-# add rfft for AbstractFloat, by calling fft
-generic_rfft(v::Vector{T}) where T<:AbstractFloats = generic_fft(v)[1:div(length(v),2)+1]
-
-function generic_irfft(v::Vector{T}, n::Integer) where T<:ComplexFloats
-    @assert n==2length(v)-1
-    r = Vector{T}(undef, n)
-    r[1:length(v)]=v
-    r[length(v)+1:end]=reverse(conj(v[2:end]))
-    real(generic_ifft(r))
-end
-
-generic_bfft(x::Vector{T}) where {T <: AbstractFloats} = conj!(generic_fft(conj(x)))
-function generic_bfft!(x::Vector{T}) where {T <: AbstractFloats}
-    x[:] = generic_bfft(x)
-    return x
-end
-
-generic_brfft(v::Vector, n::Integer) = generic_irfft(v, n)*n
-
-generic_ifft(x::Vector{T}) where {T<:AbstractFloats} = conj!(generic_fft(conj(x)))/length(x)
-function generic_ifft!(x::Vector{T}) where T<:AbstractFloats
-    x[:] = generic_ifft(x)
-    return x
-end
-
-function conv(u::StridedVector{T}, v::StridedVector{T}) where T<:AbstractFloats
-    nu,nv = length(u),length(v)
-    n = nu + nv - 1
-    np2 = nextpow(2,n)
-    append!(u,zeros(T,np2-nu)),append!(v,zeros(T,np2-nv))
-    y = generic_ifft_pow2(generic_fft_pow2(u).*generic_fft_pow2(v))
-    #TODO This would not handle Dual/ComplexDual numbers correctly
-    y = T<:Real ? real(y[1:n]) : y[1:n]
-end
-
-# This is a Cooley-Tukey FFT algorithm inspired by many widely available algorithms including:
-# c_radix2.c in the GNU Scientific Library and four1 in the Numerical Recipes in C.
-# However, the trigonometric recurrence is improved for greater efficiency.
-# The algorithm starts with bit-reversal, then divides and conquers in-place.
-function generic_fft_pow2!(x::Vector{T}) where T<:AbstractFloat
-    n,big2=length(x),2one(T)
-    nn,j=n÷2,1
-    for i=1:2:n-1
-        if j>i
-            x[j], x[i] = x[i], x[j]
-            x[j+1], x[i+1] = x[i+1], x[j+1]
-        end
-        m = nn
-        while m ≥ 2 && j > m
-            j -= m
-            m = m÷2
-        end
-        j += m
-    end
-    logn = 2
-    while logn < n
-        θ=-big2/logn
-        wtemp = sinpi(θ/2)
-        wpr, wpi = -2wtemp^2, sinpi(θ)
-        wr, wi = one(T), zero(T)
-        for m=1:2:logn-1
-            for i=m:2logn:n
-                j=i+logn
-                mixr, mixi = wr*x[j]-wi*x[j+1], wr*x[j+1]+wi*x[j]
-                x[j], x[j+1] = x[i]-mixr, x[i+1]-mixi
-                x[i], x[i+1] = x[i]+mixr, x[i+1]+mixi
-            end
-            wr = (wtemp=wr)*wpr-wi*wpi+wr
-            wi = wi*wpr+wtemp*wpi+wi
-        end
-        logn = logn << 1
-    end
-    return x
-end
-
-function generic_fft_pow2(x::Vector{Complex{T}}) where T<:AbstractFloat
-    y = interlace(real(x),imag(x))
-    generic_fft_pow2!(y)
-    return complex.(y[1:2:end],y[2:2:end])
-end
-generic_fft_pow2(x::Vector{T}) where {T<:AbstractFloat} = generic_fft_pow2(complex(x))
-
-function generic_ifft_pow2(x::Vector{Complex{T}}) where T<:AbstractFloat
-    y = interlace(real(x),-imag(x))
-    generic_fft_pow2!(y)
-    return complex.(y[1:2:end],-y[2:2:end])/length(x)
-end
-
-function generic_dct(a::AbstractVector{Complex{T}}) where {T <: AbstractFloat}
-    T <: FFTW.fftwNumber && (@warn("Using generic dct for FFTW number type."))
-    N = length(a)
-    twoN = convert(T,2) * N
-    c = generic_fft([a; reverse(a, dims=1)]) # c = generic_fft([a; flipdim(a,1)])
-    d = c[1:N]
-    d .*= exp.((-im*convert(T, pi)).*(0:N-1)./twoN)
-    d[1] = d[1] / sqrt(convert(T, 2))
-    lmul!(inv(sqrt(twoN)), d)
-end
-
-generic_dct(a::AbstractArray{T}) where {T <: AbstractFloat} = real(generic_dct(complex(a)))
-
-function generic_idct(a::AbstractVector{Complex{T}}) where {T <: AbstractFloat}
-    T <: FFTW.fftwNumber && (@warn("Using generic idct for FFTW number type."))
-    N = length(a)
-    twoN = convert(T,2)*N
-    b = a * sqrt(twoN)
-    b[1] = b[1] * sqrt(convert(T,2))
-    shift = exp.(-im * 2 * convert(T, pi) * (N - convert(T,1)/2) * (0:(2N-1)) / twoN)
-    b = [b; 0; -reverse(b[2:end], dims=1)] .* shift # b = [b; 0; -flipdim(b[2:end],1)] .* shift
-    c = ifft(b)
-    reverse(c[1:N]; dims=1)#flipdim(c[1:N],1)
-end
-
-generic_idct(a::AbstractArray{T}) where {T <: AbstractFloat} = real(generic_idct(complex(a)))
-
-generic_dct!(a::AbstractArray{T}) where {T<:AbstractFloats} = (b = generic_dct(a); a[:] = b)
-generic_idct!(a::AbstractArray{T}) where {T<:AbstractFloats} = (b = generic_idct(a); a[:] = b)
-
-# These lines mimick the corresponding ones in FFTW/src/dct.jl, but with
-# AbstractFloat rather than fftwNumber.
-for f in (:dct, :dct!, :idct, :idct!)
-    pf = Symbol("plan_", f)
-    @eval begin
-        $f(x::AbstractArray{<:AbstractFloats}) = $pf(x) * x
-        $f(x::AbstractArray{<:AbstractFloats}, region) = $pf(x, region) * x
-    end
-end
-
-# dummy plans
-abstract type DummyPlan{T} <: Plan{T} end
-for P in (:DummyFFTPlan, :DummyiFFTPlan, :DummybFFTPlan, :DummyDCTPlan, :DummyiDCTPlan)
-    # All plans need an initially undefined pinv field
-    @eval begin
-        mutable struct $P{T,inplace} <: DummyPlan{T}
-            pinv::DummyPlan{T}
-            $P{T,inplace}() where {T<:AbstractFloats, inplace} = new()
-        end
-    end
-end
-for P in (:DummyrFFTPlan, :DummyirFFTPlan, :DummybrFFTPlan)
-    @eval begin
-        mutable struct $P{T,inplace} <: DummyPlan{T}
-            n::Integer
-            pinv::DummyPlan{T}
-            $P{T,inplace}(n::Integer) where {T<:AbstractFloats, inplace} = new(n)
-        end
-    end
-end
-
-for (Plan,iPlan) in ((:DummyFFTPlan,:DummyiFFTPlan),
-                     (:DummyDCTPlan,:DummyiDCTPlan))
-   @eval begin
-       plan_inv(::$Plan{T,inplace}) where {T,inplace} = $iPlan{T,inplace}()
-       plan_inv(::$iPlan{T,inplace}) where {T,inplace} = $Plan{T,inplace}()
-    end
-end
-
-# Specific for rfft, irfft and brfft:
-plan_inv(p::DummyirFFTPlan{T,inplace}) where {T,inplace} = DummyrFFTPlan{T,Inplace}(p.n)
-plan_inv(p::DummyrFFTPlan{T,inplace}) where {T,inplace} = DummyirFFTPlan{T,Inplace}(p.n)
-
-
-
-for (Plan,ff,ff!) in ((:DummyFFTPlan,:generic_fft,:generic_fft!),
-                      (:DummybFFTPlan,:generic_bfft,:generic_bfft!),
-                      (:DummyiFFTPlan,:generic_ifft,:generic_ifft!),
-                      (:DummyrFFTPlan,:generic_rfft,:generic_rfft!),
-                      (:DummyDCTPlan,:generic_dct,:generic_dct!),
-                      (:DummyiDCTPlan,:generic_idct,:generic_idct!))
-    @eval begin
-        *(p::$Plan{T,true}, x::StridedArray{T,N}) where {T<:AbstractFloats,N} = $ff!(x)
-        *(p::$Plan{T,false}, x::StridedArray{T,N}) where {T<:AbstractFloats,N} = $ff(x)
-        function mul!(C::StridedVector, p::$Plan, x::StridedVector)
-            C[:] = $ff(x)
-            C
-        end
-    end
-end
-
-# Specific for irfft and brfft:
-*(p::DummyirFFTPlan{T,true}, x::StridedArray{T,N}) where {T<:AbstractFloats,N} = generic_irfft!(x, p.n)
-*(p::DummyirFFTPlan{T,false}, x::StridedArray{T,N}) where {T<:AbstractFloats,N} = generic_irfft(x, p.n)
-function mul!(C::StridedVector, p::DummyirFFTPlan, x::StridedVector)
-    C[:] = generic_irfft(x, p.n)
-    C
-end
-*(p::DummybrFFTPlan{T,true}, x::StridedArray{T,N}) where {T<:AbstractFloats,N} = generic_brfft!(x, p.n)
-*(p::DummybrFFTPlan{T,false}, x::StridedArray{T,N}) where {T<:AbstractFloats,N} = generic_brfft(x, p.n)
-function mul!(C::StridedVector, p::DummybrFFTPlan, x::StridedVector)
-    C[:] = generic_brfft(x, p.n)
-    C
-end
-
-
-# We override these for AbstractFloat, so that conversion from reals to
-# complex numbers works for any AbstractFloat (instead of only BlasFloat's)
-AbstractFFTs.complexfloat(x::StridedArray{Complex{<:AbstractFloat}}) = x
-AbstractFFTs.realfloat(x::StridedArray{<:Real}) = x
-# We override this one in order to avoid throwing an error that the type is
-# unsupported (as defined in AbstractFFTs)
-AbstractFFTs._fftfloat(::Type{T}) where {T <: AbstractFloat} = T
-
-
-# We intercept the calls to plan_X(x, region) below.
-# In order not to capture any calls that should go to FFTW, we have to be
-# careful about the typing, so that the calls to FFTW remain more specific.
-# This is the reason for using StridedArray below. We also have to carefully
-# distinguish between real and complex arguments.
-
-plan_fft(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummyFFTPlan{Complex{real(T)},false}()
-plan_fft!(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummyFFTPlan{Complex{real(T)},true}()
-
-plan_bfft(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummybFFTPlan{Complex{real(T)},false}()
-plan_bfft!(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummybFFTPlan{Complex{real(T)},true}()
-
-# The ifft plans are automatically provided in terms of the bfft plans above.
-# plan_ifft(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummyiFFTPlan{Complex{real(T)},false}()
-# plan_ifft!(x::StridedArray{T}, region) where {T <: ComplexFloats} = DummyiFFTPlan{Complex{real(T)},true}()
-
-plan_dct(x::StridedArray{T}, region) where {T <: AbstractFloats} = DummyDCTPlan{T,false}()
-plan_dct!(x::StridedArray{T}, region) where {T <: AbstractFloats} = DummyDCTPlan{T,true}()
-
-plan_idct(x::StridedArray{T}, region) where {T <: AbstractFloats} = DummyiDCTPlan{T,false}()
-plan_idct!(x::StridedArray{T}, region) where {T <: AbstractFloats} = DummyiDCTPlan{T,true}()
-
-plan_rfft(x::StridedArray{T}, region) where {T <: RealFloats} = DummyrFFTPlan{Complex{real(T)},false}(length(x))
-plan_brfft(x::StridedArray{T}, n::Integer, region) where {T <: ComplexFloats} = DummybrFFTPlan{Complex{real(T)},false}(n)
-
-# A plan for irfft is created in terms of a plan for brfft.
-# plan_irfft(x::StridedArray{T}, n::Integer, region) where {T <: ComplexFloats} = DummyirFFTPlan{Complex{real(T)},false}(n)
-
-# These don't exist for now:
-# plan_rfft!(x::StridedArray{T}) where {T <: RealFloats} = DummyrFFTPlan{Complex{real(T)},true}()
-# plan_irfft!(x::StridedArray{T},n::Integer) where {T <: RealFloats} = DummyirFFTPlan{Complex{real(T)},true}()
-
-function interlace(a::Vector{S},b::Vector{V}) where {S<:Number,V<:Number}
-    na=length(a);nb=length(b)
-    T=promote_type(S,V)
-    if nb≥na
-        ret=zeros(T,2nb)
-        ret[1:2:1+2*(na-1)]=a
-        ret[2:2:end]=b
-        ret
-    else
-        ret=zeros(T,2na-1)
-        ret[1:2:end]=a
-        if !isempty(b)
-            ret[2:2:2+2*(nb-1)]=b
-        end
-        ret
-    end
-end
diff --git a/src/inufft.jl b/src/inufft.jl
index 6701412c..a843c59a 100644
--- a/src/inufft.jl
+++ b/src/inufft.jl
@@ -3,9 +3,9 @@ Pre-computes an inverse nonuniform fast Fourier transform of type `N`.
 
 For best performance, choose the right number of threads by `FFTW.set_num_threads(4)`, for example.
 """
-struct iNUFFTPlan{N,T,S,PT} <: Plan{T}
+struct iNUFFTPlan{N,T,S,PT,TF} <: Plan{T}
     pt::PT
-    TP::Toeplitz{T}
+    TP::TF
     r::Vector{T}
     p::Vector{T}
     Ap::Vector{T}
@@ -24,12 +24,12 @@ function plan_inufft1(ω::AbstractVector{T}, ϵ::T) where T<:AbstractFloat
     avg = (r[1]+c[1])/2
     r[1] = avg
     c[1] = avg
-    TP = Toeplitz(c, r)
+    TP = factorize(Toeplitz(c, r))
     r = zero(c)
     p = zero(c)
     Ap = zero(c)
 
-    iNUFFTPlan{1, eltype(TP), typeof(ϵ), typeof(pt)}(pt, TP, r, p, Ap, ϵ)
+    iNUFFTPlan{1, eltype(TP), typeof(ϵ), typeof(pt), typeof(TP)}(pt, TP, r, p, Ap, ϵ)
 end
 
 """
@@ -43,12 +43,12 @@ function plan_inufft2(x::AbstractVector{T}, ϵ::T) where T<:AbstractFloat
     avg = (r[1]+c[1])/2
     r[1] = avg
     c[1] = avg
-    TP = Toeplitz(c, r)
+    TP = factorize(Toeplitz(c, r))
     r = zero(c)
     p = zero(c)
     Ap = zero(c)
 
-    iNUFFTPlan{2, eltype(TP), typeof(ϵ), typeof(pt)}(pt, TP, r, p, Ap, ϵ)
+    iNUFFTPlan{2, eltype(TP), typeof(ϵ), typeof(pt), typeof(TP)}(pt, TP, r, p, Ap, ϵ)
 end
 
 
@@ -80,10 +80,8 @@ Computes an inverse nonuniform fast Fourier transform of type II.
 """
 inufft2(c::AbstractVector, x::AbstractVector{T}, ϵ::T) where {T<:AbstractFloat} = plan_inufft2(x, ϵ)*c
 
-function cg_for_inufft(A::ToeplitzMatrices.AbstractToeplitz{T}, x::AbstractVector{T}, b::AbstractVector{T}, r::AbstractVector{T}, p::AbstractVector{T}, Ap::AbstractVector{T}, max_it::Integer, tol::Real) where T
+function cg_for_inufft(A::ToeplitzMatrices.ToeplitzFactorization{T}, x::AbstractVector{T}, b::AbstractVector{T}, r::AbstractVector{T}, p::AbstractVector{T}, Ap::AbstractVector{T}, max_it::Integer, tol::Real) where T
 	n = length(b)
-	n1, n2 = size(A)
-	n == n1 == n2 || throw(DimensionMismatch(""))
     nrmb = norm(b)
     if nrmb == 0 nrmb = one(typeof(nrmb)) end
 	copyto!(x, b)
diff --git a/src/libfasttransforms.jl b/src/libfasttransforms.jl
index 0c7d4207..3ce492d9 100644
--- a/src/libfasttransforms.jl
+++ b/src/libfasttransforms.jl
@@ -1,17 +1,22 @@
-const libfasttransforms = find_library("libfasttransforms", [joinpath(dirname(@__DIR__), "deps")])
-
-if libfasttransforms ≡ nothing || length(libfasttransforms) == 0
-    error("FastTransforms is not properly installed. Please run Pkg.build(\"FastTransforms\") ",
-          "and restart Julia.")
+if get(ENV, "FT_BUILD_FROM_SOURCE", "false") == "true"  # opt-in branch: the variable defaults to "false", so this runs only when it is set to exactly "true"
+    using Libdl  # provides find_library, used on the next line
+    const libfasttransforms = find_library("libfasttransforms", [joinpath(dirname(@__DIR__), "deps")])  # look for the library in the package's deps directory
+    if libfasttransforms ≡ nothing || length(libfasttransforms) == 0  # `nothing` or an empty result both count as "not found"
+        error("FastTransforms is not properly installed. Please run Pkg.build(\"FastTransforms\") ",
+              "and restart Julia.")
+    end
+else
+    using FastTransforms_jll  # default branch; presumably this package supplies `libfasttransforms` — confirm
 end
 
-function ft_fftw_plan_with_nthreads(n::Integer)
-    ccall((:ft_fftw_plan_with_nthreads, libfasttransforms), Cvoid, (Cint, ), n)
-end
+ft_set_num_threads(n::Integer) = ccall((:ft_set_num_threads, libfasttransforms), Cvoid, (Cint, ), n)  # forwards n, converted to Cint, to the C function ft_set_num_threads
+ft_fftw_plan_with_nthreads(n::Integer) = ccall((:ft_fftw_plan_with_nthreads, libfasttransforms), Cvoid, (Cint, ), n)  # forwards n, converted to Cint, to the C function ft_fftw_plan_with_nthreads
 
 function __init__()
+    n = ceil(Int, Sys.CPU_THREADS/2)
+    ft_set_num_threads(n)
     ccall((:ft_fftw_init_threads, libfasttransforms), Cint, ())
-    ft_fftw_plan_with_nthreads(ceil(Int, Sys.CPU_THREADS/2))
+    ft_fftw_plan_with_nthreads(n)
 end
 
 
@@ -27,43 +32,115 @@ struct mpfr_t <: AbstractFloat
     d::Ptr{Limb}
 end
 
-mpfr_t(x::BigFloat) = mpfr_t(x.prec, x.sign, x.exp, x.d)
-
-function BigFloat(x::mpfr_t)
-    nb = ccall((:mpfr_custom_get_size,:libmpfr), Csize_t, (Clong,), precision(BigFloat))
-    nb = (nb + Core.sizeof(Limb) - 1) ÷ Core.sizeof(Limb) # align to number of Limb allocations required for this
-    str = unsafe_string(Ptr{UInt8}(x.d), nb * Core.sizeof(Limb))
-    _BigFloat(x.prec, x.sign, x.exp, str)
-end
-
-set_num_threads(n::Integer) = ccall((:ft_set_num_threads, libfasttransforms), Cvoid, (Cint, ), n)
-
-const LEG2CHEB              = 0
-const CHEB2LEG              = 1
-const ULTRA2ULTRA           = 2
-const JAC2JAC               = 3
-const LAG2LAG               = 4
-const JAC2ULTRA             = 5
-const ULTRA2JAC             = 6
-const JAC2CHEB              = 7
-const CHEB2JAC              = 8
-const ULTRA2CHEB            = 9
-const CHEB2ULTRA           = 10
-const SPHERE               = 11
-const SPHEREV              = 12
-const DISK                 = 13
-const TRIANGLE             = 14
-const TETRAHEDRON          = 15
-const SPHERESYNTHESIS      = 16
-const SPHEREANALYSIS       = 17
-const SPHEREVSYNTHESIS     = 18
-const SPHEREVANALYSIS      = 19
-const DISKSYNTHESIS        = 20
-const DISKANALYSIS         = 21
-const TRIANGLESYNTHESIS    = 22
-const TRIANGLEANALYSIS     = 23
-const TETRAHEDRONSYNTHESIS = 24
-const TETRAHEDRONANALYSIS  = 25
+"""
+    renew!(x::AbstractArray{BigFloat}) -> x
+
+Overwrite every entry of `x` with a `deepcopy` of itself and return `x`.
+
+`BigFloat` is a mutable struct, so entries of an `AbstractArray{BigFloat}` may
+share storage: for `Id = Matrix{BigFloat}(I, 3, 3)`, `map(x->x.d, Id)` shows
+that the ones and the zeros all share the same `Limb` pointers. Renew the array
+before passing it to a C function that assumes every datum is distinct.
+"""
+function renew!(x::AbstractArray{BigFloat})
+    @inbounds for idx in eachindex(x)
+        x[idx] = deepcopy(x[idx])  # swap the entry for an independent copy of itself
+    end
+    x
+end
+
+function horner!(f::Vector{Float64}, c::StridedVector{Float64}, x::Vector{Float64})  # hands c, x and f to the C routine ft_horner and returns f (presumably f is the output buffer — confirm)
+    @assert length(x) == length(f)  # f must have exactly one slot per entry of x
+    ccall((:ft_horner, libfasttransforms), Cvoid, (Cint, Ptr{Float64}, Cint, Cint, Ptr{Float64}, Ptr{Float64}), length(c), c, stride(c, 1), length(x), x, f)  # argument order: length(c), c, stride of c, length(x), x, f
+    f
+end
+
+function horner!(f::Vector{Float32}, c::StridedVector{Float32}, x::Vector{Float32})  # Float32 method: hands c, x and f to the C routine ft_hornerf and returns f (presumably f is the output buffer — confirm)
+    @assert length(x) == length(f)  # f must have exactly one slot per entry of x
+    ccall((:ft_hornerf, libfasttransforms), Cvoid, (Cint, Ptr{Float32}, Cint, Cint, Ptr{Float32}, Ptr{Float32}), length(c), c, stride(c, 1), length(x), x, f)  # argument order: length(c), c, stride of c, length(x), x, f
+    f
+end
+
+function check_clenshaw_points(x, ϕ₀, f)  # true when x, ϕ₀ and f all have the same length; throws ArgumentError otherwise
+    (length(x) != length(ϕ₀) || length(ϕ₀) != length(f)) ? throw(ArgumentError("Dimensions must match")) : true
+end
+
+function check_clenshaw_points(x, f)  # two-argument form: true when x and f have the same length; throws ArgumentError otherwise
+    length(x) != length(f) ? throw(ArgumentError("Dimensions must match")) : true
+end
+
+function clenshaw!(f::Vector{Float64}, c::StridedVector{Float64}, x::Vector{Float64})  # hands c, x and f to the C routine ft_clenshaw and returns f (presumably f is the output buffer — confirm)
+    @boundscheck check_clenshaw_points(x, f)  # requires length(x) == length(f); as a @boundscheck block it may be skipped when inlined into an @inbounds context
+    ccall((:ft_clenshaw, libfasttransforms), Cvoid, (Cint, Ptr{Float64}, Cint, Cint, Ptr{Float64}, Ptr{Float64}), length(c), c, stride(c, 1), length(x), x, f)  # argument order: length(c), c, stride of c, length(x), x, f
+    f
+end
+
+function clenshaw!(f::Vector{Float32}, c::StridedVector{Float32}, x::Vector{Float32})  # Float32 method: hands c, x and f to the C routine ft_clenshawf and returns f (presumably f is the output buffer — confirm)
+    @boundscheck check_clenshaw_points(x, f)  # requires length(x) == length(f); as a @boundscheck block it may be skipped when inlined into an @inbounds context
+    ccall((:ft_clenshawf, libfasttransforms), Cvoid, (Cint, Ptr{Float32}, Cint, Cint, Ptr{Float32}, Ptr{Float32}), length(c), c, stride(c, 1), length(x), x, f)  # argument order: length(c), c, stride of c, length(x), x, f
+    f
+end
+
+function clenshaw!(f::Vector{Float64}, c::StridedVector{Float64}, A::Vector{Float64}, B::Vector{Float64}, C::Vector{Float64}, x::Vector{Float64}, ϕ₀::Vector{Float64})  # hands c, the vectors A, B, C, the points x and ϕ₀ to the C routine ft_orthogonal_polynomial_clenshaw and returns f
+    N = length(c)  # coefficient count; used by the recurrence check and passed to C
+    @boundscheck check_clenshaw_recurrences(N, A, B, C)  # helper defined outside this part of the file; presumably validates the lengths of A, B, C against N — confirm
+    @boundscheck check_clenshaw_points(x, ϕ₀, f)  # x, ϕ₀ and f must all have the same length
+    ccall((:ft_orthogonal_polynomial_clenshaw, libfasttransforms), Cvoid, (Cint, Ptr{Float64}, Cint, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Cint, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}), N, c, stride(c, 1), A, B, C, length(x), x, ϕ₀, f)  # argument order: N, c, stride of c, A, B, C, length(x), x, ϕ₀, f
+    f
+end
+
+function clenshaw!(f::Vector{Float32}, c::StridedVector{Float32}, A::Vector{Float32}, B::Vector{Float32}, C::Vector{Float32}, x::Vector{Float32}, ϕ₀::Vector{Float32})  # Float32 method: hands c, the vectors A, B, C, the points x and ϕ₀ to the C routine ft_orthogonal_polynomial_clenshawf and returns f
+    N = length(c)  # coefficient count; used by the recurrence check and passed to C
+    @boundscheck check_clenshaw_recurrences(N, A, B, C)  # helper defined outside this part of the file; presumably validates the lengths of A, B, C against N — confirm
+    @boundscheck check_clenshaw_points(x, ϕ₀, f)  # x, ϕ₀ and f must all have the same length
+    ccall((:ft_orthogonal_polynomial_clenshawf, libfasttransforms), Cvoid, (Cint, Ptr{Float32}, Cint, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}, Cint, Ptr{Float32}, Ptr{Float32}, Ptr{Float32}), N, c, stride(c, 1), A, B, C, length(x), x, ϕ₀, f)  # argument order: N, c, stride of c, A, B, C, length(x), x, ϕ₀, f
+    f
+end
+
+@enum Transforms::Cint begin  # identifiers for the transform/plan kinds, stored as Cint; presumably the order must mirror the C library's enumeration — confirm before inserting or reordering members
+    LEG2CHEB=0  # explicit start value; every following member takes the previous value plus one
+    CHEB2LEG
+    ULTRA2ULTRA
+    JAC2JAC
+    LAG2LAG
+    JAC2ULTRA
+    ULTRA2JAC
+    JAC2CHEB
+    CHEB2JAC
+    ULTRA2CHEB
+    CHEB2ULTRA
+    ASSOCIATEDJAC2JAC
+    MODIFIEDJAC2JAC
+    MODIFIEDLAG2LAG
+    MODIFIEDHERM2HERM
+    SPHERE
+    SPHEREV
+    DISK
+    ANNULUS
+    RECTDISK
+    TRIANGLE
+    TETRAHEDRON
+    SPINSPHERE
+    SPHERESYNTHESIS
+    SPHEREANALYSIS
+    SPHEREVSYNTHESIS
+    SPHEREVANALYSIS
+    DISKSYNTHESIS
+    DISKANALYSIS
+    ANNULUSSYNTHESIS
+    ANNULUSANALYSIS
+    RECTDISKSYNTHESIS
+    RECTDISKANALYSIS
+    TRIANGLESYNTHESIS
+    TRIANGLEANALYSIS
+    TETRAHEDRONSYNTHESIS
+    TETRAHEDRONANALYSIS
+    SPINSPHERESYNTHESIS
+    SPINSPHEREANALYSIS
+    SPHERICALISOMETRY
+end
+
+Transforms(t::Transforms) = t  # identity method, so `Transforms(k)` also works when k is already a Transforms (kind2string relies on this)
 
 let k2s = Dict(LEG2CHEB             => "Legendre--Chebyshev",
                CHEB2LEG             => "Chebyshev--Legendre",
@@ -76,23 +153,37 @@ let k2s = Dict(LEG2CHEB             => "Legendre--Chebyshev",
                CHEB2JAC             => "Chebyshev--Jacobi",
                ULTRA2CHEB           => "ultraspherical--Chebyshev",
                CHEB2ULTRA           => "Chebyshev--ultraspherical",
+               ASSOCIATEDJAC2JAC    => "Associated Jacobi--Jacobi",
+               MODIFIEDJAC2JAC      => "Modified Jacobi--Jacobi",
+               MODIFIEDLAG2LAG      => "Modified Laguerre--Laguerre",
+               MODIFIEDHERM2HERM    => "Modified Hermite--Hermite",
                SPHERE               => "Spherical harmonic--Fourier",
                SPHEREV              => "Spherical vector field--Fourier",
                DISK                 => "Zernike--Chebyshev×Fourier",
+               ANNULUS              => "Annulus--Chebyshev×Fourier",
+               RECTDISK             => "Dunkl-Xu--Chebyshev²",
                TRIANGLE             => "Proriol--Chebyshev²",
                TETRAHEDRON          => "Proriol--Chebyshev³",
+               SPINSPHERE           => "Spin-weighted spherical harmonic--Fourier",
                SPHERESYNTHESIS      => "FFTW Fourier synthesis on the sphere",
                SPHEREANALYSIS       => "FFTW Fourier analysis on the sphere",
                SPHEREVSYNTHESIS     => "FFTW Fourier synthesis on the sphere (vector field)",
                SPHEREVANALYSIS      => "FFTW Fourier analysis on the sphere (vector field)",
                DISKSYNTHESIS        => "FFTW Chebyshev×Fourier synthesis on the disk",
                DISKANALYSIS         => "FFTW Chebyshev×Fourier analysis on the disk",
+               ANNULUSSYNTHESIS     => "FFTW Chebyshev×Fourier synthesis on the annulus",
+               ANNULUSANALYSIS      => "FFTW Chebyshev×Fourier analysis on the annulus",
+               RECTDISKSYNTHESIS    => "FFTW Chebyshev synthesis on the rectangularized disk",
+               RECTDISKANALYSIS     => "FFTW Chebyshev analysis on the rectangularized disk",
                TRIANGLESYNTHESIS    => "FFTW Chebyshev synthesis on the triangle",
                TRIANGLEANALYSIS     => "FFTW Chebyshev analysis on the triangle",
                TETRAHEDRONSYNTHESIS => "FFTW Chebyshev synthesis on the tetrahedron",
-               TETRAHEDRONANALYSIS  => "FFTW Chebyshev analysis on the tetrahedron")
+               TETRAHEDRONANALYSIS  => "FFTW Chebyshev analysis on the tetrahedron",
+               SPINSPHERESYNTHESIS  => "FFTW Fourier synthesis on the sphere (spin-weighted)",
+               SPINSPHEREANALYSIS   => "FFTW Fourier analysis on the sphere (spin-weighted)",
+               SPHERICALISOMETRY    => "Spherical isometry")
     global kind2string
-    kind2string(k::Integer) = k2s[Int(k)]
+    kind2string(k::Union{Integer, Transforms}) = k2s[Transforms(k)]
 end
 
 struct ft_plan_struct end
@@ -125,18 +216,36 @@ show(io::IO, p::FTPlan{T, 1, K}) where {T, K} = print(io, "FastTransforms ", kin
 show(io::IO, p::FTPlan{T, 2, SPHERE}) where T = print(io, "FastTransforms ", kind2string(SPHERE), " plan for $(p.n)×$(2p.n-1)-element array of ", T)
 show(io::IO, p::FTPlan{T, 2, SPHEREV}) where T = print(io, "FastTransforms ", kind2string(SPHEREV), " plan for $(p.n)×$(2p.n-1)-element array of ", T)
 show(io::IO, p::FTPlan{T, 2, DISK}) where T = print(io, "FastTransforms ", kind2string(DISK), " plan for $(p.n)×$(4p.n-3)-element array of ", T)
+show(io::IO, p::FTPlan{T, 2, ANNULUS}) where T = print(io, "FastTransforms ", kind2string(ANNULUS), " plan for $(p.n)×$(4p.n-3)-element array of ", T)
+show(io::IO, p::FTPlan{T, 2, RECTDISK}) where T = print(io, "FastTransforms ", kind2string(RECTDISK), " plan for $(p.n)×$(p.n)-element array of ", T)
 show(io::IO, p::FTPlan{T, 2, TRIANGLE}) where T = print(io, "FastTransforms ", kind2string(TRIANGLE), " plan for $(p.n)×$(p.n)-element array of ", T)
 show(io::IO, p::FTPlan{T, 3, TETRAHEDRON}) where T = print(io, "FastTransforms ", kind2string(TETRAHEDRON), " plan for $(p.n)×$(p.n)×$(p.n)-element array of ", T)
+show(io::IO, p::FTPlan{T, 2, SPINSPHERE}) where T = print(io, "FastTransforms ", kind2string(SPINSPHERE), " plan for $(p.n)×$(2p.n-1)-element array of ", T)
 show(io::IO, p::FTPlan{T, 2, K}) where {T, K} = print(io, "FastTransforms plan for ", kind2string(K), " for $(p.n)×$(p.m)-element array of ", T)
 show(io::IO, p::FTPlan{T, 3, K}) where {T, K} = print(io, "FastTransforms plan for ", kind2string(K), " for $(p.n)×$(p.l)×$(p.m)-element array of ", T)
+show(io::IO, p::FTPlan{T, 2, SPHERICALISOMETRY}) where T = print(io, "FastTransforms ", kind2string(SPHERICALISOMETRY), " plan for $(p.n)×$(2p.n-1)-element array of ", T)
 
-function checksize(p::FTPlan{T}, x::Array{T}) where T
+function checksize(p::FTPlan{T, 1}, x::StridedArray{T}) where T
     if p.n != size(x, 1)
         throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.n), x has leading dimension $(size(x, 1))"))
     end
 end
 
-for K in (SPHERE, SPHEREV, DISK)
+function checkstride(p::FTPlan{T, 1}, x::StridedArray{T}) where T
+    lead = stride(x, 1)  # memory stride of x along its first dimension
+    lead == 1 && return nothing  # unit stride: nothing to report
+    error("FTPlan requires unit stride in the leading dimension, x has stride $(lead) in the leading dimension.")
+end
+
+for (N, K) in ((2, RECTDISK), (2, TRIANGLE), (3, TETRAHEDRON))  # (array rank, plan kind) pairs; one checksize method is generated per pair
+    @eval function checksize(p::FTPlan{T, $N, $K}, x::Array{T, $N}) where T
+        if p.n != size(x, 1)  # only the leading dimension of x is validated here
+            throw(DimensionMismatch("FTPlan has dimensions " * join(fill(p.n, $N), " × ") * ", x has leading dimension $(size(x, 1))"))  # repeat p.n once per axis so the rank-3 case reads n × n × n
+        end
+    end
+end
+
+for K in (SPHERE, SPHEREV, DISK, ANNULUS, SPINSPHERE)
     @eval function checksize(p::FTPlan{T, 2, $K}, x::Matrix{T}) where T
         if p.n != size(x, 1)
             throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.n), x has leading dimension $(size(x, 1))"))
@@ -147,87 +256,199 @@ for K in (SPHERE, SPHEREV, DISK)
     end
 end
 
+function checksize(p::FTPlan{T, 2}, x::Array{T, 2}) where T  # generic 2D check: both dimensions of x must equal (p.n, p.m)
+    if p.n != size(x, 1) || p.m != size(x, 2)
+        throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.m), x has dimensions $(size(x, 1)) × $(size(x, 2))"))
+    end
+end
+
+function checksize(p::FTPlan{T, 3}, x::Array{T, 3}) where T  # generic 3D check: the dimensions of x must equal (p.n, p.l, p.m)
+    if p.n != size(x, 1) || p.l != size(x, 2) || p.m != size(x, 3)
+        throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.l) × $(p.m), x has dimensions $(size(x, 1)) × $(size(x, 2)) × $(size(x, 3))"))
+    end
+end
+
+function checksize(p::FTPlan{T, 2, SPHERICALISOMETRY}, x::Matrix{T}) where T  # the second dimension is derived from p.n (2n-1); p.m is not consulted
+    if p.n != size(x, 1) || 2p.n-1 != size(x, 2)
+        throw(DimensionMismatch("This FTPlan must operate on arrays of size $(p.n) × $(2p.n-1)."))
+    end
+end
+
 unsafe_convert(::Type{Ptr{ft_plan_struct}}, p::FTPlan) = p.plan
 unsafe_convert(::Type{Ptr{mpfr_t}}, p::FTPlan) = unsafe_convert(Ptr{mpfr_t}, p.plan)
 
 destroy_plan(p::FTPlan{Float32, 1}) = ccall((:ft_destroy_tb_eigen_FMMf, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 1}) = ccall((:ft_destroy_tb_eigen_FMM, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{BigFloat, 1}) = ccall((:ft_mpfr_destroy_plan, libfasttransforms), Cvoid, (Ptr{mpfr_t}, Cint), p, p.n)
-destroy_plan(p::FTPlan{Float64, 2}) = ccall((:ft_destroy_harmonic_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
-destroy_plan(p::FTPlan{Float64, 3}) = ccall((:ft_destroy_tetrahedral_harmonic_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float32, 1, ASSOCIATEDJAC2JAC}) = ccall((:ft_destroy_btb_eigen_FMMf, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float64, 1, ASSOCIATEDJAC2JAC}) = ccall((:ft_destroy_btb_eigen_FMM, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float32, 1, MODIFIEDJAC2JAC}) = ccall((:ft_destroy_modified_planf, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)  # the three MODIFIED* kinds share one destructor per precision
+destroy_plan(p::FTPlan{Float64, 1, MODIFIEDJAC2JAC}) = ccall((:ft_destroy_modified_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float32, 1, MODIFIEDLAG2LAG}) = ccall((:ft_destroy_modified_planf, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float64, 1, MODIFIEDLAG2LAG}) = ccall((:ft_destroy_modified_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float32, 1, MODIFIEDHERM2HERM}) = ccall((:ft_destroy_modified_planf, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float64, 1, MODIFIEDHERM2HERM}) = ccall((:ft_destroy_modified_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float64}) = ccall((:ft_destroy_harmonic_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)  # fallback for every Float64 plan that has no more specific destroy_plan method
+destroy_plan(p::FTPlan{Complex{Float64}, 2, SPINSPHERE}) = ccall((:ft_destroy_spin_harmonic_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 2, SPHERESYNTHESIS}) = ccall((:ft_destroy_sphere_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 2, SPHEREANALYSIS}) = ccall((:ft_destroy_sphere_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 2, SPHEREVSYNTHESIS}) = ccall((:ft_destroy_sphere_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 2, SPHEREVANALYSIS}) = ccall((:ft_destroy_sphere_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 2, DISKSYNTHESIS}) = ccall((:ft_destroy_disk_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 2, DISKANALYSIS}) = ccall((:ft_destroy_disk_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float64, 2, ANNULUSSYNTHESIS}) = ccall((:ft_destroy_annulus_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)  # synthesis and analysis plans of a kind use the same destructor
+destroy_plan(p::FTPlan{Float64, 2, ANNULUSANALYSIS}) = ccall((:ft_destroy_annulus_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float64, 2, RECTDISKSYNTHESIS}) = ccall((:ft_destroy_rectdisk_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float64, 2, RECTDISKANALYSIS}) = ccall((:ft_destroy_rectdisk_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 2, TRIANGLESYNTHESIS}) = ccall((:ft_destroy_triangle_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 2, TRIANGLEANALYSIS}) = ccall((:ft_destroy_triangle_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 3, TETRAHEDRONSYNTHESIS}) = ccall((:ft_destroy_tetrahedron_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
 destroy_plan(p::FTPlan{Float64, 3, TETRAHEDRONANALYSIS}) = ccall((:ft_destroy_tetrahedron_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Complex{Float64}, 2, SPINSPHERESYNTHESIS}) = ccall((:ft_destroy_spinsphere_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)  # complex-valued plans; synthesis and analysis share the destructor
+destroy_plan(p::FTPlan{Complex{Float64}, 2, SPINSPHEREANALYSIS}) = ccall((:ft_destroy_spinsphere_fftw_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)
+destroy_plan(p::FTPlan{Float64, 2, SPHERICALISOMETRY}) = ccall((:ft_destroy_sph_isometry_plan, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ), p)  # more specific than the FTPlan{Float64} fallback, so dispatch selects this one
 
-struct AdjointFTPlan{T, S}
+struct AdjointFTPlan{T, S, R}  # T: element type reported by eltype; S: type of the wrapped plan; R: type of the optional companion plan
     parent::S
+    adjoint::R  # companion plan; left unassigned by the one-argument constructor
+    function AdjointFTPlan{T, S, R}(parent::S) where {T, S, R}
+        new(parent)  # incomplete initialisation: only `parent` is set
+    end
+    function AdjointFTPlan{T, S, R}(parent::S, adjoint::R) where {T, S, R}
+        new(parent, adjoint)  # both fields populated
+    end
 end
 
-AdjointFTPlan(p::FTPlan) = AdjointFTPlan{eltype(p), typeof(p)}(p)
+AdjointFTPlan(p::FTPlan) = AdjointFTPlan{eltype(p), typeof(p), typeof(p)}(p)  # no companion plan: R is set to typeof(p) and the `adjoint` field stays unassigned
+AdjointFTPlan(p::FTPlan, q::FTPlan) = AdjointFTPlan{eltype(q), typeof(p), typeof(q)}(p, q)  # q is stored as the companion; the element type is taken from q
 
 adjoint(p::FTPlan) = AdjointFTPlan(p)
 adjoint(p::AdjointFTPlan) = p.parent
 
-eltype(p::AdjointFTPlan{T, S}) where {T, S} = T
-ndims(p::AdjointFTPlan{T, S}) where {T, S} = ndims(p.parent)
-function show(io::IO, p::AdjointFTPlan{T, S}) where {T, S}
+eltype(p::AdjointFTPlan{T}) where T = T  # element type is the first type parameter
+ndims(p::AdjointFTPlan) = ndims(p.parent)  # dimensionality is that of the wrapped plan
+function show(io::IO, p::AdjointFTPlan)  # prints "Adjoint " followed by the parent plan's own description
     print(io, "Adjoint ")
     show(io, p.parent)
 end
 
-checksize(p::AdjointFTPlan, x) = checksize(p.parent, x)
+function checksize(p::AdjointFTPlan, x)  # validate x against the companion plan when one is stored, otherwise against the parent
+    if isdefined(p, :adjoint)  # the one-argument constructor leaves `adjoint` unassigned
+        checksize(p.adjoint, x)
+    else  # explicit test replaces a bare try/catch that swallowed every exception, including real size mismatches
+        checksize(p.parent, x)
+    end
+end
+
+function checkstride(p::AdjointFTPlan, x)  # check strides against the companion plan when one is stored, otherwise against the parent
+    if isdefined(p, :adjoint)  # the one-argument constructor leaves `adjoint` unassigned
+        checkstride(p.adjoint, x)
+    else  # explicit test replaces a bare try/catch that swallowed every exception before retrying on the parent
+        checkstride(p.parent, x)
+    end
+end
+
+function unsafe_convert(::Type{Ptr{ft_plan_struct}}, p::AdjointFTPlan)  # pointer handed to ccall: the companion plan's if stored, else the parent's
+    if isdefined(p, :adjoint)  # the one-argument constructor leaves `adjoint` unassigned
+        unsafe_convert(Ptr{ft_plan_struct}, p.adjoint)
+    else  # explicit test replaces a bare try/catch used as control flow
+        unsafe_convert(Ptr{ft_plan_struct}, p.parent)
+    end
+end
 
-unsafe_convert(::Type{Ptr{ft_plan_struct}}, p::AdjointFTPlan{T, FTPlan{T, N, K}}) where {T, N, K} = unsafe_convert(Ptr{ft_plan_struct}, p.parent)
-unsafe_convert(::Type{Ptr{mpfr_t}}, p::AdjointFTPlan{T, FTPlan{T, N, K}}) where {T, N, K} = unsafe_convert(Ptr{mpfr_t}, p.parent)
+function unsafe_convert(::Type{Ptr{mpfr_t}}, p::AdjointFTPlan)  # mpfr pointer handed to ccall: the companion plan's if stored, else the parent's
+    if isdefined(p, :adjoint)  # the one-argument constructor leaves `adjoint` unassigned
+        unsafe_convert(Ptr{mpfr_t}, p.adjoint)
+    else  # explicit test replaces a bare try/catch used as control flow
+        unsafe_convert(Ptr{mpfr_t}, p.parent)
+    end
+end
 
-struct TransposeFTPlan{T, S}
+struct TransposeFTPlan{T, S, R}  # T: element type reported by eltype; S: type of the wrapped plan; R: type of the optional companion plan
     parent::S
+    transpose::R  # companion plan; left unassigned by the one-argument constructor
+    function TransposeFTPlan{T, S, R}(parent::S) where {T, S, R}
+        new(parent)  # incomplete initialisation: only `parent` is set
+    end
+    function TransposeFTPlan{T, S, R}(parent::S, transpose::R) where {T, S, R}
+        new(parent, transpose)  # both fields populated
+    end
 end
 
-TransposeFTPlan(p::FTPlan) = TransposeFTPlan{eltype(p), typeof(p)}(p)
+TransposeFTPlan(p::FTPlan) = TransposeFTPlan{eltype(p), typeof(p), typeof(p)}(p)  # no companion plan: R is set to typeof(p) and the `transpose` field stays unassigned
+TransposeFTPlan(p::FTPlan, q::FTPlan) = TransposeFTPlan{eltype(q), typeof(p), typeof(q)}(p, q)  # q is stored as the companion; the element type is taken from q
 
 transpose(p::FTPlan) = TransposeFTPlan(p)
 transpose(p::TransposeFTPlan) = p.parent
 
-eltype(p::TransposeFTPlan{T, S}) where {T, S} = T
-ndims(p::TransposeFTPlan{T, S}) where {T, S} = ndims(p.parent)
-function show(io::IO, p::TransposeFTPlan{T, S}) where {T, S}
+eltype(p::TransposeFTPlan{T}) where T = T  # element type is the first type parameter
+ndims(p::TransposeFTPlan) = ndims(p.parent)  # dimensionality is that of the wrapped plan
+function show(io::IO, p::TransposeFTPlan)  # prints "Transpose " followed by the parent plan's own description
     print(io, "Transpose ")
     show(io, p.parent)
 end
 
-checksize(p::TransposeFTPlan, x) = checksize(p.parent, x)
+function checksize(p::TransposeFTPlan, x)  # validate x against the companion plan when one is stored, otherwise against the parent
+    if isdefined(p, :transpose)  # the one-argument constructor leaves `transpose` unassigned
+        checksize(p.transpose, x)
+    else  # explicit test replaces a bare try/catch that swallowed every exception, including real size mismatches
+        checksize(p.parent, x)
+    end
+end
 
-unsafe_convert(::Type{Ptr{ft_plan_struct}}, p::TransposeFTPlan{T, FTPlan{T, N, K}}) where {T, N, K} = unsafe_convert(Ptr{ft_plan_struct}, p.parent)
-unsafe_convert(::Type{Ptr{mpfr_t}}, p::TransposeFTPlan{T, FTPlan{T, N, K}}) where {T, N, K} = unsafe_convert(Ptr{mpfr_t}, p.parent)
+function checkstride(p::TransposeFTPlan, x)  # check strides against the companion plan when one is stored, otherwise against the parent
+    if isdefined(p, :transpose)  # the one-argument constructor leaves `transpose` unassigned
+        checkstride(p.transpose, x)
+    else  # explicit test replaces a bare try/catch that swallowed every exception before retrying on the parent
+        checkstride(p.parent, x)
+    end
+end
+
+function unsafe_convert(::Type{Ptr{ft_plan_struct}}, p::TransposeFTPlan)  # pointer handed to ccall: the companion plan's if stored, else the parent's
+    if isdefined(p, :transpose)  # the one-argument constructor leaves `transpose` unassigned
+        unsafe_convert(Ptr{ft_plan_struct}, p.transpose)
+    else  # explicit test replaces a bare try/catch used as control flow
+        unsafe_convert(Ptr{ft_plan_struct}, p.parent)
+    end
+end
+
+function unsafe_convert(::Type{Ptr{mpfr_t}}, p::TransposeFTPlan)  # mpfr pointer handed to ccall: the companion plan's if stored, else the parent's
+    if isdefined(p, :transpose)  # the one-argument constructor leaves `transpose` unassigned
+        unsafe_convert(Ptr{mpfr_t}, p.transpose)
+    else  # explicit test replaces a bare try/catch used as control flow
+        unsafe_convert(Ptr{mpfr_t}, p.parent)
+    end
+end
+
+const ModifiedFTPlan{T} = Union{FTPlan{T, 1, MODIFIEDJAC2JAC}, FTPlan{T, 1, MODIFIEDLAG2LAG}, FTPlan{T, 1, MODIFIEDHERM2HERM}}  # alias covering the three 1D "modified" plan kinds
 
 for f in (:leg2cheb, :cheb2leg, :ultra2ultra, :jac2jac,
           :lag2lag, :jac2ultra, :ultra2jac, :jac2cheb,
-          :cheb2jac, :ultra2cheb, :cheb2ultra,
-          :sph2fourier, :sphv2fourier, :disk2cxf, :tri2cheb, :tet2cheb)
+          :cheb2jac, :ultra2cheb, :cheb2ultra, :associatedjac2jac,
+          :modifiedjac2jac, :modifiedlag2lag, :modifiedherm2herm,
+          :sph2fourier, :sphv2fourier, :disk2cxf, :ann2cxf,
+          :rectdisk2cheb, :tri2cheb, :tet2cheb)
     plan_f = Symbol("plan_", f)
+    lib_f = Symbol("lib_", f)  # name of the generated one-shot wrapper, e.g. :lib_leg2cheb
     @eval begin
         $plan_f(x::AbstractArray{T}, y...; z...) where T = $plan_f(T, size(x, 1), y...; z...)
         $plan_f(::Type{Complex{T}}, y...; z...) where T <: Real = $plan_f(T, y...; z...)
-        $f(x::AbstractArray, y...; z...) = $plan_f(x, y...; z...)*x
+        $lib_f(x::AbstractArray, y...; z...) = $plan_f(x, y...; z...)*x  # builds a plan from x and applies it; this loop now defines lib_<f> instead of <f>
     end
 end
 
 for (f, plan_f) in ((:fourier2sph, :plan_sph2fourier), (:fourier2sphv, :plan_sphv2fourier),
-                    (:cxf2disk2, :plan_disk2cxf), (:cheb2tri, :plan_tri2cheb),
+                    (:cxf2disk, :plan_disk2cxf), (:cxf2ann, :plan_ann2cxf),  # each inverse is defined below by left-dividing x by the paired forward plan
+                    (:cheb2rectdisk, :plan_rectdisk2cheb), (:cheb2tri, :plan_tri2cheb),
                     (:cheb2tet, :plan_tet2cheb))
     @eval begin
         $f(x::AbstractArray, y...; z...) = $plan_f(x, y...; z...)\x
     end
 end
 
+plan_spinsph2fourier(x::AbstractArray{T}, y...; z...) where T = plan_spinsph2fourier(T, size(x, 1), y...; z...)  # derives the element type and n from x
+spinsph2fourier(x::AbstractArray, y...; z...) = plan_spinsph2fourier(x, y...; z...)*x  # forward transform: plan applied with *
+fourier2spinsph(x::AbstractArray, y...; z...) = plan_spinsph2fourier(x, y...; z...)\x  # inverse transform: same plan applied with \
 
 function plan_leg2cheb(::Type{Float32}, n::Integer; normleg::Bool=false, normcheb::Bool=false)
     plan = ccall((:ft_plan_legendre_to_chebyshevf, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint), normleg, normcheb, n)
@@ -284,6 +505,41 @@ function plan_cheb2ultra(::Type{Float32}, n::Integer, λ; normcheb::Bool=false,
     return FTPlan{Float32, 1, CHEB2ULTRA}(plan, n)
 end
 
+function plan_associatedjac2jac(::Type{Float32}, n::Integer, c::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)  # single-precision planner; α, β, γ, δ are passed to C as Float32
+    plan = ccall((:ft_plan_associated_jacobi_to_jacobif, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint, Cint, Float32, Float32, Float32, Float32), norm1, norm2, n, c, α, β, γ, δ)
+    return FTPlan{Float32, 1, ASSOCIATEDJAC2JAC}(plan, n)  # wraps the C handle together with n
+end
+
+function plan_modifiedjac2jac(::Type{Float32}, n::Integer, α, β, u::Vector{Float32}; verbose::Bool=false)  # single-vector form: the second (length, pointer) pair is passed as 0 and C_NULL
+    plan = ccall((:ft_plan_modified_jacobi_to_jacobif, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float32, Float32, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint), n, α, β, length(u), u, 0, C_NULL, verbose)
+    return FTPlan{Float32, 1, MODIFIEDJAC2JAC}(plan, n)
+end
+
+function plan_modifiedjac2jac(::Type{Float32}, n::Integer, α, β, u::Vector{Float32}, v::Vector{Float32}; verbose::Bool=false)  # two-vector form: u and v are both passed with their lengths
+    plan = ccall((:ft_plan_modified_jacobi_to_jacobif, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float32, Float32, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint), n, α, β, length(u), u, length(v), v, verbose)
+    return FTPlan{Float32, 1, MODIFIEDJAC2JAC}(plan, n)
+end
+
+function plan_modifiedlag2lag(::Type{Float32}, n::Integer, α, u::Vector{Float32}; verbose::Bool=false)  # single-vector form: the second (length, pointer) pair is passed as 0 and C_NULL
+    plan = ccall((:ft_plan_modified_laguerre_to_laguerref, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float32, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint), n, α, length(u), u, 0, C_NULL, verbose)
+    return FTPlan{Float32, 1, MODIFIEDLAG2LAG}(plan, n)
+end
+
+function plan_modifiedlag2lag(::Type{Float32}, n::Integer, α, u::Vector{Float32}, v::Vector{Float32}; verbose::Bool=false)  # two-vector form: u and v are both passed with their lengths
+    plan = ccall((:ft_plan_modified_laguerre_to_laguerref, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float32, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint), n, α, length(u), u, length(v), v, verbose)
+    return FTPlan{Float32, 1, MODIFIEDLAG2LAG}(plan, n)
+end
+
+function plan_modifiedherm2herm(::Type{Float32}, n::Integer, u::Vector{Float32}; verbose::Bool=false)  # single-vector form: the second (length, pointer) pair is passed as 0 and C_NULL
+    plan = ccall((:ft_plan_modified_hermite_to_hermitef, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint), n, length(u), u, 0, C_NULL, verbose)
+    return FTPlan{Float32, 1, MODIFIEDHERM2HERM}(plan, n)
+end
+
+function plan_modifiedherm2herm(::Type{Float32}, n::Integer, u::Vector{Float32}, v::Vector{Float32}; verbose::Bool=false)  # two-vector form: u and v are both passed with their lengths
+    plan = ccall((:ft_plan_modified_hermite_to_hermitef, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Ptr{Float32}, Cint, Ptr{Float32}, Cint), n, length(u), u, length(v), v, verbose)
+    return FTPlan{Float32, 1, MODIFIEDHERM2HERM}(plan, n)
+end
+
 
 function plan_leg2cheb(::Type{Float64}, n::Integer; normleg::Bool=false, normcheb::Bool=false)
     plan = ccall((:ft_plan_legendre_to_chebyshev, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint), normleg, normcheb, n)
@@ -340,6 +596,41 @@ function plan_cheb2ultra(::Type{Float64}, n::Integer, λ; normcheb::Bool=false,
     return FTPlan{Float64, 1, CHEB2ULTRA}(plan, n)
 end
 
+function plan_associatedjac2jac(::Type{Float64}, n::Integer, c::Integer, α, β, γ, δ; norm1::Bool=false, norm2::Bool=false)  # double-precision planner; α, β, γ, δ are passed to C as Float64
+    plan = ccall((:ft_plan_associated_jacobi_to_jacobi, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint, Cint, Float64, Float64, Float64, Float64), norm1, norm2, n, c, α, β, γ, δ)
+    return FTPlan{Float64, 1, ASSOCIATEDJAC2JAC}(plan, n)  # wraps the C handle together with n
+end
+
+function plan_modifiedjac2jac(::Type{Float64}, n::Integer, α, β, u::Vector{Float64}; verbose::Bool=false)  # single-vector form: the second (length, pointer) pair is passed as 0 and C_NULL
+    plan = ccall((:ft_plan_modified_jacobi_to_jacobi, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float64, Float64, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint), n, α, β, length(u), u, 0, C_NULL, verbose)
+    return FTPlan{Float64, 1, MODIFIEDJAC2JAC}(plan, n)
+end
+
+function plan_modifiedjac2jac(::Type{Float64}, n::Integer, α, β, u::Vector{Float64}, v::Vector{Float64}; verbose::Bool=false)  # two-vector form: u and v are both passed with their lengths
+    plan = ccall((:ft_plan_modified_jacobi_to_jacobi, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float64, Float64, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint), n, α, β, length(u), u, length(v), v, verbose)
+    return FTPlan{Float64, 1, MODIFIEDJAC2JAC}(plan, n)
+end
+
+function plan_modifiedlag2lag(::Type{Float64}, n::Integer, α, u::Vector{Float64}; verbose::Bool=false)  # single-vector form: the second (length, pointer) pair is passed as 0 and C_NULL
+    plan = ccall((:ft_plan_modified_laguerre_to_laguerre, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float64, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint), n, α, length(u), u, 0, C_NULL, verbose)
+    return FTPlan{Float64, 1, MODIFIEDLAG2LAG}(plan, n)
+end
+
+function plan_modifiedlag2lag(::Type{Float64}, n::Integer, α, u::Vector{Float64}, v::Vector{Float64}; verbose::Bool=false)  # two-vector form: u and v are both passed with their lengths
+    plan = ccall((:ft_plan_modified_laguerre_to_laguerre, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float64, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint), n, α, length(u), u, length(v), v, verbose)
+    return FTPlan{Float64, 1, MODIFIEDLAG2LAG}(plan, n)
+end
+
+function plan_modifiedherm2herm(::Type{Float64}, n::Integer, u::Vector{Float64}; verbose::Bool=false)  # single-vector form: the second (length, pointer) pair is passed as 0 and C_NULL
+    plan = ccall((:ft_plan_modified_hermite_to_hermite, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint), n, length(u), u, 0, C_NULL, verbose)
+    return FTPlan{Float64, 1, MODIFIEDHERM2HERM}(plan, n)
+end
+
+function plan_modifiedherm2herm(::Type{Float64}, n::Integer, u::Vector{Float64}, v::Vector{Float64}; verbose::Bool=false)  # two-vector form: u and v are both passed with their lengths
+    plan = ccall((:ft_plan_modified_hermite_to_hermite, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Ptr{Float64}, Cint, Ptr{Float64}, Cint), n, length(u), u, length(v), v, verbose)
+    return FTPlan{Float64, 1, MODIFIEDHERM2HERM}(plan, n)
+end
+
 
 function plan_leg2cheb(::Type{BigFloat}, n::Integer; normleg::Bool=false, normcheb::Bool=false)
     plan = ccall((:ft_mpfr_plan_legendre_to_chebyshev, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint, Clong, Int32), normleg, normcheb, n, precision(BigFloat), Base.MPFR.ROUNDING_MODE[])
@@ -407,11 +698,21 @@ function plan_sphv2fourier(::Type{Float64}, n::Integer)
     return FTPlan{Float64, 2, SPHEREV}(plan, n)
 end
 
-function plan_disk2cxf(::Type{Float64}, n::Integer)
-    plan = ccall((:ft_plan_disk2cxf, libfasttransforms), Ptr{ft_plan_struct}, (Cint, ), n)
+function plan_disk2cxf(::Type{Float64}, n::Integer, α, β)  # the C planner now also receives α and β
+    plan = ccall((:ft_plan_disk2cxf, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float64, Float64), n, α, β)  # α and β are passed to C as Float64
     return FTPlan{Float64, 2, DISK}(plan, n)
 end
 
+function plan_ann2cxf(::Type{Float64}, n::Integer, α, β, γ, ρ)  # full-parameter form; α, β, γ, ρ are passed to C as Float64
+    plan = ccall((:ft_plan_ann2cxf, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float64, Float64, Float64, Float64), n, α, β, γ, ρ)
+    return FTPlan{Float64, 2, ANNULUS}(plan, n)  # wraps the C handle together with n
+end
+
+function plan_rectdisk2cheb(::Type{Float64}, n::Integer, β)  # β is passed to C as Float64
+    plan = ccall((:ft_plan_rectdisk2cheb, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float64), n, β)
+    return FTPlan{Float64, 2, RECTDISK}(plan, n)  # wraps the C handle together with n
+end
+
 function plan_tri2cheb(::Type{Float64}, n::Integer, α, β, γ)
     plan = ccall((:ft_plan_tri2cheb, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Float64, Float64, Float64), n, α, β, γ)
     return FTPlan{Float64, 2, TRIANGLE}(plan, n)
@@ -422,119 +723,279 @@ function plan_tet2cheb(::Type{Float64}, n::Integer, α, β, γ, δ)
     return FTPlan{Float64, 3, TETRAHEDRON}(plan, n)
 end
 
-for (fJ, fC, fE, K) in ((:plan_sph_synthesis, :ft_plan_sph_synthesis, :ft_execute_sph_synthesis, SPHERESYNTHESIS),
-                    (:plan_sph_analysis, :ft_plan_sph_analysis, :ft_execute_sph_analysis, SPHEREANALYSIS),
-                    (:plan_sphv_synthesis, :ft_plan_sphv_synthesis, :ft_execute_sphv_synthesis, SPHEREVSYNTHESIS),
-                    (:plan_sphv_analysis, :ft_plan_sphv_analysis, :ft_execute_sphv_analysis, SPHEREVANALYSIS),
-                    (:plan_disk_synthesis, :ft_plan_disk_synthesis, :ft_execute_disk_synthesis, DISKSYNTHESIS),
-                    (:plan_disk_analysis, :ft_plan_disk_analysis, :ft_execute_disk_analysis, DISKANALYSIS),
-                    (:plan_tri_synthesis, :ft_plan_tri_synthesis, :ft_execute_tri_synthesis, TRIANGLESYNTHESIS),
-                    (:plan_tri_analysis, :ft_plan_tri_analysis, :ft_execute_tri_analysis, TRIANGLEANALYSIS))
+function plan_spinsph2fourier(::Type{Complex{Float64}}, n::Integer, s::Integer)  # n and s are both passed to C as Cint
+    plan = ccall((:ft_plan_spinsph2fourier, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint), n, s)
+    return FTPlan{Complex{Float64}, 2, SPINSPHERE}(plan, n)  # the plan is tagged with a complex element type
+end
+
+plan_disk2cxf(::Type{Float64}, n::Integer, α) = plan_disk2cxf(Float64, n, α, 0)  # shorter signatures in this group fill omitted trailing parameters with 0
+plan_disk2cxf(::Type{Float64}, n::Integer) = plan_disk2cxf(Float64, n, 0)
+plan_ann2cxf(::Type{Float64}, n::Integer, α, β, γ) = plan_ann2cxf(Float64, n, α, β, γ, 0)
+plan_ann2cxf(::Type{Float64}, n::Integer, α, β) = plan_disk2cxf(Float64, n, α, β)  # NOTE(review): delegates to plan_disk2cxf, which returns a DISK-kind plan rather than ANNULUS — confirm intended
+plan_ann2cxf(::Type{Float64}, n::Integer, α) = plan_disk2cxf(Float64, n, α)  # likewise routed through plan_disk2cxf
+plan_ann2cxf(::Type{Float64}, n::Integer) = plan_disk2cxf(Float64, n)  # likewise routed through plan_disk2cxf
+plan_rectdisk2cheb(::Type{Float64}, n::Integer) = plan_rectdisk2cheb(Float64, n, 0)
+plan_tri2cheb(::Type{Float64}, n::Integer, α, β) = plan_tri2cheb(Float64, n, α, β, 0)
+plan_tri2cheb(::Type{Float64}, n::Integer, α) = plan_tri2cheb(Float64, n, α, 0)
+plan_tri2cheb(::Type{Float64}, n::Integer) = plan_tri2cheb(Float64, n, 0)
+plan_tet2cheb(::Type{Float64}, n::Integer, α, β, γ) = plan_tet2cheb(Float64, n, α, β, γ, 0)
+plan_tet2cheb(::Type{Float64}, n::Integer, α, β) = plan_tet2cheb(Float64, n, α, β, 0)
+plan_tet2cheb(::Type{Float64}, n::Integer, α) = plan_tet2cheb(Float64, n, α, 0)
+plan_tet2cheb(::Type{Float64}, n::Integer) = plan_tet2cheb(Float64, n, 0)
+
+for (fJ, fadJ, fC, fE, K) in ((:plan_sph_synthesis, :plan_sph_analysis, :ft_plan_sph_synthesis, :ft_execute_sph_synthesis, SPHERESYNTHESIS),  # tuple layout: Julia planner, planner for the adjoint/transpose companion, C planner symbol, C execute symbol, plan kind
+                              (:plan_sph_analysis, :plan_sph_synthesis, :ft_plan_sph_analysis, :ft_execute_sph_analysis, SPHEREANALYSIS),
+                              (:plan_sphv_synthesis, :plan_sphv_analysis, :ft_plan_sphv_synthesis, :ft_execute_sphv_synthesis, SPHEREVSYNTHESIS),
+                              (:plan_sphv_analysis, :plan_sphv_synthesis, :ft_plan_sphv_analysis, :ft_execute_sphv_analysis, SPHEREVANALYSIS),
+                              (:plan_disk_synthesis, :plan_disk_analysis, :ft_plan_disk_synthesis, :ft_execute_disk_synthesis, DISKSYNTHESIS),
+                              (:plan_disk_analysis, :plan_disk_synthesis, :ft_plan_disk_analysis, :ft_execute_disk_analysis, DISKANALYSIS),
+                              (:plan_rectdisk_synthesis, :plan_rectdisk_analysis, :ft_plan_rectdisk_synthesis, :ft_execute_rectdisk_synthesis, RECTDISKSYNTHESIS),
+                              (:plan_rectdisk_analysis, :plan_rectdisk_synthesis, :ft_plan_rectdisk_analysis, :ft_execute_rectdisk_analysis, RECTDISKANALYSIS),
+                              (:plan_tri_synthesis, :plan_tri_analysis, :ft_plan_tri_synthesis, :ft_execute_tri_synthesis, TRIANGLESYNTHESIS),
+                              (:plan_tri_analysis, :plan_tri_synthesis, :ft_plan_tri_analysis, :ft_execute_tri_analysis, TRIANGLEANALYSIS))
     @eval begin
-        $fJ(x::Matrix{T}) where T = $fJ(T, size(x, 1), size(x, 2))
-        $fJ(::Type{Complex{T}}, x...) where T <: Real = $fJ(T, x...)
-        function $fJ(::Type{Float64}, n::Integer, m::Integer)
-            plan = ccall(($(string(fC)), libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint), n, m)
+        $fJ(x::Matrix{T}; y...) where T = $fJ(T, size(x, 1), size(x, 2); y...)  # keyword arguments are forwarded to the typed method
+        $fJ(::Type{Complex{T}}, x...; y...) where T <: Real = $fJ(T, x...; y...)  # a complex element type is planned with its real counterpart
+        function $fJ(::Type{Float64}, n::Integer, m::Integer; flags::Integer=FFTW.ESTIMATE)  # flags defaults to FFTW.ESTIMATE
+            plan = ccall(($(string(fC)), libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cuint), n, m, flags)  # flags is passed to C as Cuint
             return FTPlan{Float64, 2, $K}(plan, n, m)
         end
+        adjoint(p::FTPlan{T, 2, $K}) where T = AdjointFTPlan(p, $fadJ(T, p.n, p.m))  # companion plan comes from the paired planner (fadJ) at the same size
+        transpose(p::FTPlan{T, 2, $K}) where T = TransposeFTPlan(p, $fadJ(T, p.n, p.m))  # same companion construction as adjoint
         function lmul!(p::FTPlan{Float64, 2, $K}, x::Matrix{Float64})
-            if p.n != size(x, 1) || p.m != size(x, 2)
-                throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.m), x has dimensions $(size(x, 1)) × $(size(x, 2))"))
-            end
-            ccall(($(string(fE)), libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), p, x, size(x, 1), size(x, 2))
+            checksize(p, x)  # size validation is delegated to checksize
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'N', p, x, size(x, 1), size(x, 2))  # leading Cint is 'N' here, 'T' in the adjoint/transpose methods — presumably a BLAS-style trans flag; confirm against the C API
+            return x
+        end
+        function lmul!(p::AdjointFTPlan{Float64, FTPlan{Float64, 2, $K}}, x::Matrix{Float64})  # in-place application of the adjoint wrapper
+            checksize(p, x)
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2))
+            return x
+        end
+        function lmul!(p::TransposeFTPlan{Float64, FTPlan{Float64, 2, $K}}, x::Matrix{Float64})  # identical call to the adjoint method (real data)
+            checksize(p, x)
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2))
             return x
         end
     end
 end
 
-plan_tet_synthesis(x::Array{T, 3}) where T = plan_tet_synthesis(T, size(x, 1), size(x, 2), size(x, 3))
-plan_tet_synthesis(::Type{Complex{T}}, x...) where T <: Real = plan_tet_synthesis(T, x...)
+ft_get_rho_annulus_fftw_plan(p::FTPlan{Float64, 2, ANNULUSSYNTHESIS}) = ccall((:ft_get_rho_annulus_fftw_plan, libfasttransforms), Float64, (Ptr{ft_plan_struct}, ), p)  # reads a Float64 back from the C plan; used to rebuild the companion plan in adjoint/transpose
+ft_get_rho_annulus_fftw_plan(p::FTPlan{Float64, 2, ANNULUSANALYSIS}) = ccall((:ft_get_rho_annulus_fftw_plan, libfasttransforms), Float64, (Ptr{ft_plan_struct}, ), p)  # same C accessor for analysis plans
 
-function plan_tet_synthesis(::Type{Float64}, n::Integer, l::Integer, m::Integer)
-    plan = ccall((:ft_plan_tet_synthesis, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint), n, l, m)
-    return FTPlan{Float64, 3, TETRAHEDRONSYNTHESIS}(plan, n, l, m)
+for (fJ, fadJ, fC, fE, K) in ((:plan_annulus_synthesis, :plan_annulus_analysis, :ft_plan_annulus_synthesis, :ft_execute_annulus_synthesis, ANNULUSSYNTHESIS),  # tuple layout: Julia planner, companion planner, C planner symbol, C execute symbol, plan kind
+                              (:plan_annulus_analysis, :plan_annulus_synthesis, :ft_plan_annulus_analysis, :ft_execute_annulus_analysis, ANNULUSANALYSIS))
+    @eval begin
+        $fJ(x::Matrix{T}, ρ; y...) where T = $fJ(T, size(x, 1), size(x, 2), ρ; y...)  # ρ must be supplied explicitly; no default is defined here
+        $fJ(::Type{Complex{T}}, x...; y...) where T <: Real = $fJ(T, x...; y...)  # a complex element type is planned with its real counterpart
+        function $fJ(::Type{Float64}, n::Integer, m::Integer, ρ; flags::Integer=FFTW.ESTIMATE)  # flags defaults to FFTW.ESTIMATE
+            plan = ccall(($(string(fC)), libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Float64, Cuint), n, m, ρ, flags)  # ρ is passed as Float64, flags as Cuint
+            return FTPlan{Float64, 2, $K}(plan, n, m)
+        end
+        adjoint(p::FTPlan{T, 2, $K}) where T = AdjointFTPlan(p, $fadJ(T, p.n, p.m, ft_get_rho_annulus_fftw_plan(p)))  # companion plan is rebuilt with the ρ read back from p
+        transpose(p::FTPlan{T, 2, $K}) where T = TransposeFTPlan(p, $fadJ(T, p.n, p.m, ft_get_rho_annulus_fftw_plan(p)))  # same companion construction as adjoint
+        function lmul!(p::FTPlan{Float64, 2, $K}, x::Matrix{Float64})
+            checksize(p, x)  # size validation is delegated to checksize
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'N', p, x, size(x, 1), size(x, 2))  # leading Cint is 'N' here, 'T' below — presumably a BLAS-style trans flag; confirm against the C API
+            return x
+        end
+        function lmul!(p::AdjointFTPlan{Float64, FTPlan{Float64, 2, $K}}, x::Matrix{Float64})  # in-place application of the adjoint wrapper
+            checksize(p, x)
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2))
+            return x
+        end
+        function lmul!(p::TransposeFTPlan{Float64, FTPlan{Float64, 2, $K}}, x::Matrix{Float64})  # identical call to the adjoint method (real data)
+            checksize(p, x)
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2))
+            return x
+        end
+    end
 end
 
-function lmul!(p::FTPlan{Float64, 3, TETRAHEDRONSYNTHESIS}, x::Array{Float64, 3})
-    if p.n != size(x, 1) || p.l != size(x, 2) || p.m != size(x, 3)
-        throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.l) × $(p.m), x has dimensions $(size(x, 1)) × $(size(x, 2)) × $(size(x, 3))"))
+for (fJ, fadJ, fC, fE, K) in ((:plan_tet_synthesis, :plan_tet_analysis, :ft_plan_tet_synthesis, :ft_execute_tet_synthesis, TETRAHEDRONSYNTHESIS),  # tuple layout: Julia planner, companion planner, C planner symbol, C execute symbol, plan kind
+                              (:plan_tet_analysis, :plan_tet_synthesis, :ft_plan_tet_analysis, :ft_execute_tet_analysis, TETRAHEDRONANALYSIS))
+    @eval begin
+        $fJ(x::Array{T, 3}; y...) where T = $fJ(T, size(x, 1), size(x, 2), size(x, 3); y...)  # all three dimensions are taken from x
+        $fJ(::Type{Complex{T}}, x...; y...) where T <: Real = $fJ(T, x...; y...)  # a complex element type is planned with its real counterpart
+        function $fJ(::Type{Float64}, n::Integer, l::Integer, m::Integer; flags::Integer=FFTW.ESTIMATE)  # flags defaults to FFTW.ESTIMATE
+            plan = ccall(($(string(fC)), libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint, Cuint), n, l, m, flags)  # flags is passed to C as Cuint
+            return FTPlan{Float64, 3, $K}(plan, n, l, m)
+        end
+        adjoint(p::FTPlan{T, 3, $K}) where T = AdjointFTPlan(p, $fadJ(T, p.n, p.l, p.m))  # companion plan comes from the paired planner (fadJ) at the same size
+        transpose(p::FTPlan{T, 3, $K}) where T = TransposeFTPlan(p, $fadJ(T, p.n, p.l, p.m))  # same companion construction as adjoint
+        function lmul!(p::FTPlan{Float64, 3, $K}, x::Array{Float64, 3})
+            checksize(p, x)  # size validation is delegated to checksize
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), 'N', p, x, size(x, 1), size(x, 2), size(x, 3))  # leading Cint is 'N' here, 'T' below — presumably a BLAS-style trans flag; confirm against the C API
+            return x
+        end
+        function lmul!(p::AdjointFTPlan{Float64, FTPlan{Float64, 3, $K}}, x::Array{Float64, 3})  # in-place application of the adjoint wrapper
+            checksize(p, x)
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2), size(x, 3))
+            return x
+        end
+        function lmul!(p::TransposeFTPlan{Float64, FTPlan{Float64, 3, $K}}, x::Array{Float64, 3})  # identical call to the adjoint method (real data)
+            checksize(p, x)
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2), size(x, 3))
+            return x
+        end
     end
-    ccall((:ft_execute_tet_synthesis, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), p, x, size(x, 1), size(x, 2), size(x, 3))
-    return x
 end
 
-plan_tet_analysis(x::Array{T, 3}) where T = plan_tet_analysis(T, size(x, 1), size(x, 2), size(x, 3))
-plan_tet_analysis(::Type{Complex{T}}, x...) where T <: Real = plan_tet_analysis(T, x...)
-
-function plan_tet_analysis(::Type{Float64}, n::Integer, l::Integer, m::Integer)
-    plan = ccall((:ft_plan_tet_analysis, libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint), n, l, m)
-    return FTPlan{Float64, 3, TETRAHEDRONANALYSIS}(plan, n, l, m)
+for (fJ, fadJ, fC, fE, K) in ((:plan_spinsph_synthesis, :plan_spinsph_analysis, :ft_plan_spinsph_synthesis, :ft_execute_spinsph_synthesis, SPINSPHERESYNTHESIS),  # tuple layout: Julia planner, companion planner, C planner symbol, C execute symbol, plan kind
+                              (:plan_spinsph_analysis, :plan_spinsph_synthesis, :ft_plan_spinsph_analysis, :ft_execute_spinsph_analysis, SPINSPHEREANALYSIS))
+    @eval begin
+        $fJ(x::Matrix{T}, s::Integer; y...) where T = $fJ(T, size(x, 1), size(x, 2), s; y...)  # s must be supplied explicitly
+        function $fJ(::Type{Complex{Float64}}, n::Integer, m::Integer, s::Integer; flags::Integer=FFTW.ESTIMATE)  # only the Complex{Float64} element type is implemented here
+            plan = ccall(($(string(fC)), libfasttransforms), Ptr{ft_plan_struct}, (Cint, Cint, Cint, Cuint), n, m, s, flags)
+            return FTPlan{Complex{Float64}, 2, $K}(plan, n, m)
+        end
+        get_spin(p::FTPlan{T, 2, $K}) where T = ccall((:ft_get_spin_spinsphere_fftw_plan, libfasttransforms), Cint, (Ptr{ft_plan_struct},), p)  # reads a Cint back from the C plan
+        adjoint(p::FTPlan{T, 2, $K}) where T = AdjointFTPlan(p, $fadJ(T, p.n, p.m, get_spin(p)))  # companion plan is rebuilt with the value returned by get_spin
+        transpose(p::FTPlan{T, 2, $K}) where T = TransposeFTPlan(p, $fadJ(T, p.n, p.m, get_spin(p)))  # same companion construction as adjoint
+        function lmul!(p::FTPlan{Complex{Float64}, 2, $K}, x::Matrix{Complex{Float64}})
+            checksize(p, x)  # size validation is delegated to checksize
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'N', p, x, size(x, 1), size(x, 2))  # x is complex but is handed to C as Ptr{Float64}
+            return x
+        end
+        function lmul!(p::AdjointFTPlan{Complex{Float64}, FTPlan{Complex{Float64}, 2, $K}}, x::Matrix{Complex{Float64}})  # adjoint passes 'C' instead of 'N'
+            checksize(p, x)
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'C', p, x, size(x, 1), size(x, 2))
+            return x
+        end
+        function lmul!(p::TransposeFTPlan{Complex{Float64}, FTPlan{Complex{Float64}, 2, $K}}, x::Matrix{Complex{Float64}})  # transpose = conjugate, 'C' execution, conjugate again
+            checksize(p, x)
+            conj!(x)  # conjugate in place before the 'C' call
+            ccall(($(string(fE)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), 'C', p, x, size(x, 1), size(x, 2))
+            conj!(x)  # conjugate the result in place
+            return x
+        end
+    end
 end
 
-function lmul!(p::FTPlan{Float64, 3, TETRAHEDRONANALYSIS}, x::Array{Float64, 3})
-    if p.n != size(x, 1) || p.l != size(x, 2) || p.m != size(x, 3)
-        throw(DimensionMismatch("FTPlan has dimensions $(p.n) × $(p.l) × $(p.m), x has dimensions $(size(x, 1)) × $(size(x, 2)) × $(size(x, 3))"))
-    end
-    ccall((:ft_execute_tet_analysis, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), p, x, size(x, 1), size(x, 2), size(x, 3))
-    return x
+# Create a plan of size `n` by calling `ft_plan_sph_isometry` in libfasttransforms and wrap the
+# returned pointer in an `FTPlan` tagged `SPHERICALISOMETRY`.
+function plan_sph_isometry(::Type{Float64}, n::Integer)
+    plan = ccall((:ft_plan_sph_isometry, libfasttransforms), Ptr{ft_plan_struct}, (Cint, ), n)
+    return FTPlan{Float64, 2, SPHERICALISOMETRY}(plan, n)
 end
 
+# Out-of-place application (`*`) and inversion (`\`) for inputs whose element type matches the
+# plan: the in-place routine runs on `Array(x)` and that array is returned.
+*(p::FTPlan{T}, x::AbstractArray{T}) where T = lmul!(p, Array(x))
+*(p::AdjointFTPlan{T}, x::AbstractArray{T}) where T = lmul!(p, Array(x))
+*(p::TransposeFTPlan{T}, x::AbstractArray{T}) where T = lmul!(p, Array(x))
+\(p::FTPlan{T}, x::AbstractArray{T}) where T = ldiv!(p, Array(x))
+\(p::AdjointFTPlan{T}, x::AbstractArray{T}) where T = ldiv!(p, Array(x))
+\(p::TransposeFTPlan{T}, x::AbstractArray{T}) where T = ldiv!(p, Array(x))
 
-*(p::FTPlan{T}, x::Array{T}) where T = lmul!(p, deepcopy(x))
-*(p::AdjointFTPlan{T}, x::Array{T}) where T = lmul!(p, deepcopy(x))
-*(p::TransposeFTPlan{T}, x::Array{T}) where T = lmul!(p, deepcopy(x))
-\(p::FTPlan{T}, x::Array{T}) where T = ldiv!(p, deepcopy(x))
-\(p::AdjointFTPlan{T}, x::Array{T}) where T = ldiv!(p, deepcopy(x))
-\(p::TransposeFTPlan{T}, x::Array{T}) where T = ldiv!(p, deepcopy(x))
+# Applying a 1D plan to a `UniformScaling`: the scaling is materialized as a dense square matrix
+# of the plan's size with element type `promote_type(T, S)`, transformed in place, and the result
+# is wrapped as upper triangular for forward plans and lower triangular for adjoint/transpose
+# plans (whose size is read from `p.parent.n`).
+*(p::FTPlan{T, 1}, x::UniformScaling{S}) where {T, S} = UpperTriangular(lmul!(p, Matrix{promote_type(T, S)}(x, p.n, p.n)))
+*(p::AdjointFTPlan{T, FTPlan{T, 1, K}}, x::UniformScaling{S}) where {T, S, K} = LowerTriangular(lmul!(p, Matrix{promote_type(T, S)}(x, p.parent.n, p.parent.n)))
+*(p::TransposeFTPlan{T, FTPlan{T, 1, K}}, x::UniformScaling{S}) where {T, S, K} = LowerTriangular(lmul!(p, Matrix{promote_type(T, S)}(x, p.parent.n, p.parent.n)))
+\(p::FTPlan{T, 1}, x::UniformScaling{S}) where {T, S} = UpperTriangular(ldiv!(p, Matrix{promote_type(T, S)}(x, p.n, p.n)))
+\(p::AdjointFTPlan{T, FTPlan{T, 1, K}}, x::UniformScaling{S}) where {T, S, K} = LowerTriangular(ldiv!(p, Matrix{promote_type(T, S)}(x, p.parent.n, p.parent.n)))
+\(p::TransposeFTPlan{T, FTPlan{T, 1, K}}, x::UniformScaling{S}) where {T, S, K} = LowerTriangular(ldiv!(p, Matrix{promote_type(T, S)}(x, p.parent.n, p.parent.n)))
+
+# Unions covering both the ordinary and the unit-diagonal triangular wrappers, so one method
+# signature can accept either.
+const AbstractUpperTriangular{T, S <: AbstractMatrix} = Union{UpperTriangular{T, S}, UnitUpperTriangular{T, S}}
+const AbstractLowerTriangular{T, S <: AbstractMatrix} = Union{LowerTriangular{T, S}, UnitLowerTriangular{T, S}}
+
+# Triangular inputs to 1D plans: the input is densified with `Array(x)`, transformed in place,
+# and re-wrapped as upper triangular (forward plan) or lower triangular (adjoint/transpose).
+# The second type parameter of `AdjointFTPlan`/`TransposeFTPlan` is the parent plan type, so the
+# 1D restriction must be written as `FTPlan{T, 1, K}`; a bare `1` in that slot never matches.
+*(p::FTPlan{T, 1}, x::AbstractUpperTriangular) where T = UpperTriangular(lmul!(p, Array(x)))
+*(p::AdjointFTPlan{T, FTPlan{T, 1, K}}, x::AbstractLowerTriangular) where {T, K} = LowerTriangular(lmul!(p, Array(x)))
+*(p::TransposeFTPlan{T, FTPlan{T, 1, K}}, x::AbstractLowerTriangular) where {T, K} = LowerTriangular(lmul!(p, Array(x)))
+\(p::FTPlan{T, 1}, x::AbstractUpperTriangular) where T = UpperTriangular(ldiv!(p, Array(x)))
+\(p::AdjointFTPlan{T, FTPlan{T, 1, K}}, x::AbstractLowerTriangular) where {T, K} = LowerTriangular(ldiv!(p, Array(x)))
+\(p::TransposeFTPlan{T, FTPlan{T, 1, K}}, x::AbstractLowerTriangular) where {T, K} = LowerTriangular(ldiv!(p, Array(x)))
 
 for (fJ, fC, elty) in ((:lmul!, :ft_bfmvf, :Float32),
                        (:ldiv!, :ft_bfsvf, :Float32),
                        (:lmul!, :ft_bfmv , :Float64),
                        (:ldiv!, :ft_bfsv , :Float64))
     @eval begin
-        function $fJ(p::FTPlan{$elty, 1}, x::Vector{$elty})
+        function $fJ(p::FTPlan{$elty, 1}, x::StridedVector{$elty})
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'N', p, x)
+            return x
+        end
+        function $fJ(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, K}}, x::StridedVector{$elty}) where K
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'T', p, x)
+            return x
+        end
+        function $fJ(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, K}}, x::StridedVector{$elty}) where K
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'T', p, x)
+            return x
+        end
+    end
+end
+
+# In-place `lmul!` for ASSOCIATEDJAC2JAC plans acting on strided vectors, generated for Float32
+# and Float64. The forward method passes the flags ('N', '2', '1') to the C routine; the adjoint
+# and transpose methods both pass ('T', '1', '2').
+for (fJ, fC, elty) in ((:lmul!, :ft_bbbfmvf, :Float32),
+                       (:lmul!, :ft_bbbfmv , :Float64))
+    @eval begin
+        function $fJ(p::FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}, x::StridedVector{$elty})
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'N', '2', '1', p, x)
+            return x
+        end
+        function $fJ(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}}, x::StridedVector{$elty})
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'T', '1', '2', p, x)
+            return x
+        end
+        function $fJ(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}}, x::StridedVector{$elty})
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'T', '1', '2', p, x)
+            return x
+        end
+    end
+end
+
+for (fJ, fC, elty) in ((:lmul!, :ft_mpmvf, :Float32),
+                       (:ldiv!, :ft_mpsvf, :Float32),
+                       (:lmul!, :ft_mpmv , :Float64),
+                       (:ldiv!, :ft_mpsv , :Float64))
+    @eval begin
+        function $fJ(p::ModifiedFTPlan{$elty}, x::StridedVector{$elty})
             checksize(p, x)
+            checkstride(p, x)
             ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'N', p, x)
             return x
         end
-        function $fJ(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, K}}, x::Vector{$elty}) where K
+        function $fJ(p::AdjointFTPlan{$elty, ModifiedFTPlan{$elty}}, x::StridedVector{$elty})
             checksize(p, x)
+            checkstride(p, x)
             ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'T', p, x)
             return x
         end
-        function $fJ(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, K}}, x::Vector{$elty}) where K
+        function $fJ(p::TransposeFTPlan{$elty, ModifiedFTPlan{$elty}}, x::StridedVector{$elty})
             checksize(p, x)
+            checkstride(p, x)
             ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}), 'T', p, x)
             return x
         end
     end
 end
 
-for (fJ, fC) in ((:lmul!, :ft_mpfr_trmv),
-                 (:ldiv!, :ft_mpfr_trsv))
+for (fJ, fC) in ((:lmul!, :ft_mpfr_trmv_ptr),
+                 (:ldiv!, :ft_mpfr_trsv_ptr))
     @eval begin
-        function $fJ(p::FTPlan{BigFloat, 1}, x::Vector{BigFloat})
+        function $fJ(p::FTPlan{BigFloat, 1}, x::StridedVector{BigFloat})
             checksize(p, x)
-            xt = deepcopy.(x)
-            xc = mpfr_t.(xt)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{mpfr_t}, Int32), 'N', p.n, p, p.n, xc, Base.MPFR.ROUNDING_MODE[])
-            x .= BigFloat.(xc)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Int32), 'N', p.n, p, p.n, renew!(x), Base.MPFR.ROUNDING_MODE[])
             return x
         end
-        function $fJ(p::AdjointFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::Vector{BigFloat}) where K
+        function $fJ(p::AdjointFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::StridedVector{BigFloat}) where K
             checksize(p, x)
-            xt = deepcopy.(x)
-            xc = mpfr_t.(xt)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{mpfr_t}, Int32), 'T', p.parent.n, p, p.parent.n, xc, Base.MPFR.ROUNDING_MODE[])
-            x .= BigFloat.(xc)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Int32), 'T', p.parent.n, p, p.parent.n, renew!(x), Base.MPFR.ROUNDING_MODE[])
             return x
         end
-        function $fJ(p::TransposeFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::Vector{BigFloat}) where K
+        function $fJ(p::TransposeFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::StridedVector{BigFloat}) where K
             checksize(p, x)
-            xt = deepcopy.(x)
-            xc = mpfr_t.(xt)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{mpfr_t}, Int32), 'T', p.parent.n, p, p.parent.n, xc, Base.MPFR.ROUNDING_MODE[])
-            x .= BigFloat.(xc)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Int32), 'T', p.parent.n, p, p.parent.n, renew!(x), Base.MPFR.ROUNDING_MODE[])
             return x
         end
     end
@@ -545,89 +1006,216 @@ for (fJ, fC, elty) in ((:lmul!, :ft_bfmmf, :Float32),
                        (:lmul!, :ft_bfmm , :Float64),
                        (:ldiv!, :ft_bfsm , :Float64))
     @eval begin
-        function $fJ(p::FTPlan{$elty, 1}, x::Matrix{$elty})
+        function $fJ(p::FTPlan{$elty, 1}, x::StridedMatrix{$elty})
             checksize(p, x)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'N', p, x, size(x, 1), size(x, 2))
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'N', p, x, stride(x, 2), size(x, 2))
             return x
         end
-        function $fJ(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, K}}, x::Matrix{$elty}) where K
+        function $fJ(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, K}}, x::StridedMatrix{$elty}) where K
             checksize(p, x)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2))
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'T', p, x, stride(x, 2), size(x, 2))
             return x
         end
-        function $fJ(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, K}}, x::Matrix{$elty}) where K
+        function $fJ(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, K}}, x::StridedMatrix{$elty}) where K
             checksize(p, x)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2))
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'T', p, x, stride(x, 2), size(x, 2))
             return x
         end
     end
 end
 
-for (fJ, fC) in ((:lmul!, :ft_mpfr_trmm),
-                 (:ldiv!, :ft_mpfr_trsm))
+for (fJ, fC, elty) in ((:lmul!, :ft_bbbfmmf, :Float32),
+                       (:lmul!, :ft_bbbfmm , :Float64))
     @eval begin
-        function $fJ(p::FTPlan{BigFloat, 1}, x::Matrix{BigFloat})
+        function $fJ(p::FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}, x::StridedMatrix{$elty})
             checksize(p, x)
-            xt = deepcopy.(x)
-            xc = mpfr_t.(xt)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{mpfr_t}, Cint, Cint, Int32), 'N', p.n, p, p.n, xc, size(x, 1), size(x, 2), Base.MPFR.ROUNDING_MODE[])
-            x .= BigFloat.(xc)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'N', '2', '1', p, x, stride(x, 2), size(x, 2))
             return x
         end
-        function $fJ(p::AdjointFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::Matrix{BigFloat}) where K
+        function $fJ(p::AdjointFTPlan{$elty, FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}}, x::StridedMatrix{$elty})
             checksize(p, x)
-            xt = deepcopy.(x)
-            xc = mpfr_t.(xt)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{mpfr_t}, Cint, Cint, Int32), 'T', p.parent.n, p, p.parent.n, xc, size(x, 1), size(x, 2), Base.MPFR.ROUNDING_MODE[])
-            x .= BigFloat.(xc)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'T', '1', '2', p, x, stride(x, 2), size(x, 2))
             return x
         end
-        function $fJ(p::TransposeFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::Matrix{BigFloat}) where K
+        function $fJ(p::TransposeFTPlan{$elty, FTPlan{$elty, 1, ASSOCIATEDJAC2JAC}}, x::StridedMatrix{$elty})
             checksize(p, x)
-            xt = deepcopy.(x)
-            xc = mpfr_t.(xt)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{mpfr_t}, Cint, Cint, Int32), 'T', p.parent.n, p, p.parent.n, xc, size(x, 1), size(x, 2), Base.MPFR.ROUNDING_MODE[])
-            x .= BigFloat.(xc)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'T', '1', '2', p, x, stride(x, 2), size(x, 2))
             return x
         end
     end
 end
 
-for (fJ, fC, K) in ((:lmul!, :ft_execute_sph2fourier, SPHERE),
-                    (:ldiv!, :ft_execute_fourier2sph, SPHERE),
-                    (:lmul!, :ft_execute_sphv2fourier, SPHEREV),
-                    (:ldiv!, :ft_execute_fourier2sphv, SPHEREV),
-                    (:lmul!, :ft_execute_disk2cxf, DISK),
-                    (:ldiv!, :ft_execute_cxf2disk, DISK),
-                    (:lmul!, :ft_execute_tri2cheb, TRIANGLE),
-                    (:ldiv!, :ft_execute_cheb2tri, TRIANGLE))
+for (fJ, fC, elty) in ((:lmul!, :ft_mpmmf, :Float32),
+                       (:ldiv!, :ft_mpsmf, :Float32),
+                       (:lmul!, :ft_mpmm , :Float64),
+                       (:ldiv!, :ft_mpsm , :Float64))
     @eval begin
-        function $fJ(p::FTPlan{Float64, 2, $K}, x::Matrix{Float64})
+        function $fJ(p::ModifiedFTPlan{$elty}, x::StridedMatrix{$elty})
             checksize(p, x)
-            ccall(($(string(fC)), libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), p, x, size(x, 1), size(x, 2))
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'N', p, x, stride(x, 2), size(x, 2))
+            return x
+        end
+        function $fJ(p::AdjointFTPlan{$elty, ModifiedFTPlan{$elty}}, x::StridedMatrix{$elty})
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'T', p, x, stride(x, 2), size(x, 2))
+            return x
+        end
+        function $fJ(p::TransposeFTPlan{$elty, ModifiedFTPlan{$elty}}, x::StridedMatrix{$elty})
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$elty}, Cint, Cint), 'T', p, x, stride(x, 2), size(x, 2))
             return x
         end
     end
 end
 
-function lmul!(p::FTPlan{Float64, 3, TETRAHEDRON}, x::Array{Float64, 3})
+# In-place `lmul!`/`ldiv!` for BigFloat 1D plans acting on strided matrices, using the `_ptr`
+# variants of the MPFR triangular multiply/solve routines. `renew!(x)` is passed as the data
+# argument, `stride(x, 2)` and `size(x, 2)` as the two trailing integer arguments, and the
+# current MPFR rounding mode is forwarded. Adjoint and transpose both use the 'T' flag and read
+# the size from `p.parent.n`.
+for (fJ, fC) in ((:lmul!, :ft_mpfr_trmm_ptr),
+                 (:ldiv!, :ft_mpfr_trsm_ptr))
+    @eval begin
+        function $fJ(p::FTPlan{BigFloat, 1}, x::StridedMatrix{BigFloat})
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Cint, Cint, Int32), 'N', p.n, p, p.n, renew!(x), stride(x, 2), size(x, 2), Base.MPFR.ROUNDING_MODE[])
+            return x
+        end
+        function $fJ(p::AdjointFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::StridedMatrix{BigFloat}) where K
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Cint, Cint, Int32), 'T', p.parent.n, p, p.parent.n, renew!(x), stride(x, 2), size(x, 2), Base.MPFR.ROUNDING_MODE[])
+            return x
+        end
+        function $fJ(p::TransposeFTPlan{BigFloat, FTPlan{BigFloat, 1, K}}, x::StridedMatrix{BigFloat}) where K
+            checksize(p, x)
+            checkstride(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Cint, Ptr{mpfr_t}, Cint, Ptr{BigFloat}, Cint, Cint, Int32), 'T', p.parent.n, p, p.parent.n, renew!(x), stride(x, 2), size(x, 2), Base.MPFR.ROUNDING_MODE[])
+            return x
+        end
+    end
+end
+
+# Generate in-place `lmul!`/`ldiv!` methods for the two-dimensional harmonic plans. Each tuple
+# lists the Julia function, the C execution routine, the element type, the array rank and the
+# plan-kind tag. Forward plans pass 'N' to the C routine; adjoint and transpose wrappers both
+# pass 'T'. Only dense `Array`s are accepted, and the dimensions are taken from `size(x)`.
+for (fJ, fC, T, N, K) in ((:lmul!, :ft_execute_sph2fourier, Float64, 2, SPHERE),
+                          (:ldiv!, :ft_execute_fourier2sph, Float64, 2, SPHERE),
+                          (:lmul!, :ft_execute_sphv2fourier, Float64, 2, SPHEREV),
+                          (:ldiv!, :ft_execute_fourier2sphv, Float64, 2, SPHEREV),
+                          (:lmul!, :ft_execute_spinsph2fourier, Complex{Float64}, 2, SPINSPHERE),
+                          (:ldiv!, :ft_execute_fourier2spinsph, Complex{Float64}, 2, SPINSPHERE),
+                          (:lmul!, :ft_execute_disk2cxf, Float64, 2, DISK),
+                          (:ldiv!, :ft_execute_cxf2disk, Float64, 2, DISK),
+                          (:lmul!, :ft_execute_ann2cxf, Float64, 2, ANNULUS),
+                          (:ldiv!, :ft_execute_cxf2ann, Float64, 2, ANNULUS),
+                          (:lmul!, :ft_execute_rectdisk2cheb, Float64, 2, RECTDISK),
+                          (:ldiv!, :ft_execute_cheb2rectdisk, Float64, 2, RECTDISK),
+                          (:lmul!, :ft_execute_tri2cheb, Float64, 2, TRIANGLE),
+                          (:ldiv!, :ft_execute_cheb2tri, Float64, 2, TRIANGLE))
+    @eval begin
+        function $fJ(p::FTPlan{$T, $N, $K}, x::Array{$T, $N})
+            checksize(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$T}, Cint, Cint), 'N', p, x, size(x, 1), size(x, 2))
+            return x
+        end
+        function $fJ(p::AdjointFTPlan{$T, FTPlan{$T, $N, $K}}, x::Array{$T, $N})
+            checksize(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$T}, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2))
+            return x
+        end
+        function $fJ(p::TransposeFTPlan{$T, FTPlan{$T, $N, $K}}, x::Array{$T, $N})
+            checksize(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{$T}, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2))
+            return x
+        end
+    end
+end
+
+# In-place `lmul!`/`ldiv!` for tetrahedron plans on rank-3 Float64 arrays. The forward method
+# passes 'N' to the C routine; adjoint and transpose both pass 'T'. All three array dimensions
+# are forwarded after the data pointer.
+for (fJ, fC) in ((:lmul!, :ft_execute_tet2cheb),
+                 (:ldiv!, :ft_execute_cheb2tet))
+    @eval begin
+        function $fJ(p::FTPlan{Float64, 3, TETRAHEDRON}, x::Array{Float64, 3})
+            checksize(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), 'N', p, x, size(x, 1), size(x, 2), size(x, 3))
+            return x
+        end
+        function $fJ(p::AdjointFTPlan{Float64, FTPlan{Float64, 3, TETRAHEDRON}}, x::Array{Float64, 3})
+            checksize(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2), size(x, 3))
+            return x
+        end
+        function $fJ(p::TransposeFTPlan{Float64, FTPlan{Float64, 3, TETRAHEDRON}}, x::Array{Float64, 3})
+            checksize(p, x)
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Cint, Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), 'T', p, x, size(x, 1), size(x, 2), size(x, 3))
+            return x
+        end
+    end
+end
+
+# Call `ft_execute_sph_polar_rotation` on `x` in place and return `x`. The angle `α` is handed
+# to the C routine as the pair `sin(α)`, `cos(α)`; no plan is required.
+function execute_sph_polar_rotation!(x::Matrix{Float64}, α)
+    ccall((:ft_execute_sph_polar_rotation, libfasttransforms), Cvoid, (Ptr{Float64}, Cint, Cint, Float64, Float64), x, size(x, 1), size(x, 2), sin(α), cos(α))
+    return x
+end
+
+# Call `ft_execute_sph_polar_reflection` on `x` in place and return `x`; no plan is required.
+function execute_sph_polar_reflection!(x::Matrix{Float64})
+    ccall((:ft_execute_sph_polar_reflection, libfasttransforms), Cvoid, (Ptr{Float64}, Cint, Cint), x, size(x, 1), size(x, 2))
+    return x
+end
+
+# Nine Float64 values describing a 3 × 3 transformation, used as a ccall argument type.
+struct ft_orthogonal_transformation
+    Q::NTuple{9, Float64}
+end
+
+# Build the struct from the leading 3 × 3 block of a matrix, stored column by column.
+# Larger matrices are accepted; entries outside the leading block are ignored.
+function convert(::Type{ft_orthogonal_transformation}, Q::AbstractMatrix)
+    @assert size(Q, 1) ≥ 3 && size(Q, 2) ≥ 3
+    return ft_orthogonal_transformation((Q[1, 1], Q[2, 1], Q[3, 1], Q[1, 2], Q[2, 2], Q[3, 2], Q[1, 3], Q[2, 3], Q[3, 3]))
+end
+# A 9-tuple is taken as already being in the stored order.
+convert(::Type{ft_orthogonal_transformation}, Q::NTuple{9, Float64}) = ft_orthogonal_transformation(Q)
+
+# Apply `ft_execute_sph_orthogonal_transformation` to `x` in place using the isometry plan `p`,
+# after checking that `x` matches the plan size. `Q` is declared to ccall as an
+# `ft_orthogonal_transformation`, so a matrix or 9-tuple goes through the `convert` methods.
+function execute_sph_orthogonal_transformation!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, Q, x::Matrix{Float64})
+    checksize(p, x)
+    ccall((:ft_execute_sph_orthogonal_transformation, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ft_orthogonal_transformation, Ptr{Float64}, Cint, Cint), p, Q, x, size(x, 1), size(x, 2))
+    return x
+end
+
+# Apply `ft_execute_sph_yz_axis_exchange` to `x` in place using the isometry plan `p`, after
+# checking that `x` matches the plan size. Returns `x`.
+function execute_sph_yz_axis_exchange!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, x::Matrix{Float64})
+    checksize(p, x)
+    ccall((:ft_execute_sph_yz_axis_exchange, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint), p, x, size(x, 1), size(x, 2))
+    return x
+end
+
+function execute_sph_rotation!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, α, β, γ, x::Matrix{Float64})
     checksize(p, x)
-    ccall((:ft_execute_tet2cheb, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), p, x, size(x, 1), size(x, 2), size(x, 3))
+    ccall((:ft_execute_sph_rotation, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Float64, Float64, Float64, Ptr{Float64}, Cint, Cint), p, α, β, γ, x, size(x, 1), size(x, 2))
     return x
 end
 
-function ldiv!(p::FTPlan{Float64, 3, TETRAHEDRON}, x::Array{Float64, 3})
+# Three Float64 components of a reflection vector, used as a ccall argument type.
+struct ft_reflection
+    w::NTuple{3, Float64}
+end
+
+# Build the struct from the first three entries of a vector; longer vectors are accepted and
+# the remaining entries are ignored.
+function convert(::Type{ft_reflection}, w::AbstractVector)
+    @assert length(w) ≥ 3
+    return ft_reflection((w[1], w[2], w[3]))
+end
+# A 3-tuple of Float64 is wrapped directly.
+convert(::Type{ft_reflection}, w::NTuple{3, Float64}) = ft_reflection(w)
+
+function execute_sph_reflection!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, w, x::Matrix{Float64})
     checksize(p, x)
-    ccall((:ft_execute_cheb2tet, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Ptr{Float64}, Cint, Cint, Cint), p, x, size(x, 1), size(x, 2), size(x, 3))
+    ccall((:ft_execute_sph_reflection, libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, ft_reflection, Ptr{Float64}, Cint, Cint), p, w, x, size(x, 1), size(x, 2))
     return x
 end
+# Convenience method taking the three components of the reflection vector separately.
+# `ft_reflection` has a single `NTuple{3, Float64}` field, so the components are packed into
+# one Float64 tuple before constructing it (there is no three-argument constructor).
+execute_sph_reflection!(p::FTPlan{Float64, 2, SPHERICALISOMETRY}, w1, w2, w3, x::Matrix{Float64}) = execute_sph_reflection!(p, ft_reflection((Float64(w1), Float64(w2), Float64(w3))), x)
 
-*(p::FTPlan{T}, x::Array{Complex{T}}) where T = lmul!(p, deepcopy(x))
-*(p::AdjointFTPlan{T}, x::Array{Complex{T}}) where T = lmul!(p, deepcopy(x))
-*(p::TransposeFTPlan{T}, x::Array{Complex{T}}) where T = lmul!(p, deepcopy(x))
-\(p::FTPlan{T}, x::Array{Complex{T}}) where T = ldiv!(p, deepcopy(x))
-\(p::AdjointFTPlan{T}, x::Array{Complex{T}}) where T = ldiv!(p, deepcopy(x))
-\(p::TransposeFTPlan{T}, x::Array{Complex{T}}) where T = ldiv!(p, deepcopy(x))
+# Out-of-place application and inversion for complex input on a plan with real element type
+# `T`: the in-place routine runs on `Array(x)` and that array is returned.
+*(p::FTPlan{T}, x::AbstractArray{Complex{T}}) where T = lmul!(p, Array(x))
+*(p::AdjointFTPlan{T}, x::AbstractArray{Complex{T}}) where T = lmul!(p, Array(x))
+*(p::TransposeFTPlan{T}, x::AbstractArray{Complex{T}}) where T = lmul!(p, Array(x))
+\(p::FTPlan{T}, x::AbstractArray{Complex{T}}) where T = ldiv!(p, Array(x))
+\(p::AdjointFTPlan{T}, x::AbstractArray{Complex{T}}) where T = ldiv!(p, Array(x))
+\(p::TransposeFTPlan{T}, x::AbstractArray{Complex{T}}) where T = ldiv!(p, Array(x))
 
 for fJ in (:lmul!, :ldiv!)
     @eval begin
@@ -635,13 +1223,37 @@ for fJ in (:lmul!, :ldiv!)
             x .= complex.($fJ(p, real(x)), $fJ(p, imag(x)))
             return x
         end
-        function $fJ(p::AdjointFTPlan{T, FTPlan{T, N, K}}, x::AbstractArray{Complex{T}}) where {T, N, K}
+        function $fJ(p::AdjointFTPlan{T}, x::AbstractArray{Complex{T}}) where T
             x .= complex.($fJ(p, real(x)), $fJ(p, imag(x)))
             return x
         end
-        function $fJ(p::TransposeFTPlan{T, FTPlan{T, N, K}}, x::AbstractArray{Complex{T}}) where {T, N, K}
+        function $fJ(p::TransposeFTPlan{T}, x::AbstractArray{Complex{T}}) where T
             x .= complex.($fJ(p, real(x)), $fJ(p, imag(x)))
             return x
         end
     end
 end
+
+# `modified_jacobi_matrix` for a modified plan, generated for Float32 and Float64. The size `n`
+# is the smaller of the plan size and the size of `XP`; the result is a `SymTridiagonal` with
+# `n-1` diagonal and `n-2` off-diagonal entries, allocated uninitialized and passed to the C
+# routine together with the diagonals of `XP`.
+# NOTE(review): the uninitialized output is presumably fully written by the C routine — confirm.
+for (fC, T) in ((:execute_jacobi_similarityf, Float32), (:execute_jacobi_similarity, Float64))
+    @eval begin
+        function modified_jacobi_matrix(P::ModifiedFTPlan{$T}, XP::SymTridiagonal{$T, Vector{$T}})
+            n = min(P.n, size(XP, 1))
+            XQ = SymTridiagonal(Vector{$T}(undef, n-1), Vector{$T}(undef, n-2))
+            ccall(($(string(fC)), libfasttransforms), Cvoid, (Ptr{ft_plan_struct}, Cint, Ptr{$T}, Ptr{$T}, Ptr{$T}, Ptr{$T}), P, n, XP.dv, XP.ev, XQ.dv, XQ.ev)
+            return XQ
+        end
+    end
+end
+
+# Generic fallback: build the `n × n` symmetric tridiagonal result, with `n = size(R, 1) - 1`,
+# from entries of `R` on its diagonal and first superdiagonal and entries of `XP` on its diagonal
+# and first subdiagonal. Each diagonal entry after the first depends on the previous
+# off-diagonal entry, so the rows are processed in increasing order.
+function modified_jacobi_matrix(R, XP)
+    n = size(R, 1) - 1
+    XQ = SymTridiagonal(zeros(n), zeros(n-1))
+    d, e = XQ.dv, XQ.ev
+    for i in 1:n
+        acc = R[i, i]*XP[i, i] + R[i, i+1]*XP[i+1, i]
+        if i > 1
+            # e[i-1] (the (i, i-1) entry of the result) was filled on the previous pass.
+            acc -= e[i-1]*R[i-1, i]
+        end
+        d[i] = acc/R[i, i]
+        if i < n
+            e[i] = R[i+1, i+1]*XP[i+1, i]/R[i, i]
+        end
+    end
+    return XQ
+end
diff --git a/src/nufft.jl b/src/nufft.jl
index faef09f7..2e49f37d 100644
--- a/src/nufft.jl
+++ b/src/nufft.jl
@@ -187,6 +187,9 @@ mul_for_col_J!(y::AbstractVecOrMat{T}, A::AbstractMatrix{T}, x::AbstractVecOrMat
 function mul_for_col_J!(y::AbstractVecOrMat{T}, A::AbstractMatrix{T}, x::AbstractVecOrMat{T}, istart::Int, jstart::Int, INCX::Int, INCY::Int) where T
     m, n = size(A)
     ishift, jshift = istart-INCY, jstart-INCX
+    @inbounds for i = 1:m
+        y[ishift+i*INCY] = zero(T)
+    end
     @inbounds for j = 1:n
         xj = x[jshift+j*INCX]
         for i = 1:m
diff --git a/src/specialfunctions.jl b/src/specialfunctions.jl
index ca74344b..ba6e0ea1 100644
--- a/src/specialfunctions.jl
+++ b/src/specialfunctions.jl
@@ -56,6 +56,8 @@ function pochhammer(x::Number,n::UnitRange{T}) where T<:Real
     ret
 end
 
+# Local `lgamma`: the first component of the tuple returned by `logabsgamma`.
+# NOTE(review): presumably log|Γ(x)| for real `x`, with the sign of Γ(x) discarded — confirm
+# against the SpecialFunctions version in use.
+lgamma(x) = logabsgamma(x)[1]
+
 ogamma(x::Number) = (isinteger(x) && x<0) ? zero(float(x)) : inv(gamma(x))
 
 """
@@ -133,7 +135,8 @@ end
 """
 The Lambda function ``\\Lambda(z) = \\frac{\\Gamma(z+\\frac{1}{2})}{\\Gamma(z+1)}`` for the ratio of gamma functions.
 """
-Λ(z::Number) = exp(lgamma(z+half(z))-lgamma(z+one(z)))
+Λ(z::Number) = Λ(z, half(z), one(z))
+
 """
 For 64-bit floating-point arithmetic, the Lambda function uses the asymptotic series for ``\\tau`` in Appendix B of
 
@@ -151,12 +154,18 @@ end
 """
 The Lambda function ``\\Lambda(z,λ₁,λ₂) = \\frac{\\Gamma(z+\\lambda_1)}{Γ(z+\\lambda_2)}`` for the ratio of gamma functions.
 """
-Λ(z::Number,λ₁::Number,λ₂::Number) = exp(lgamma(z+λ₁)-lgamma(z+λ₂))
-function Λ(x::Float64,λ₁::Float64,λ₂::Float64)
+function Λ(z::Real, λ₁::Real, λ₂::Real)
+    # With both gamma arguments positive the ratio is formed from log-gamma values;
+    # otherwise the plain ratio of gamma functions is used.
+    if z+λ₁ > 0 && z+λ₂ > 0
+        exp(lgamma(z+λ₁)-lgamma(z+λ₂))
+    else
+        gamma(z+λ₁)/gamma(z+λ₂)
+    end
+end
+# Fallback for non-real arguments, which cannot be ordered against zero: use the plain ratio.
+# Keeps `Λ(z::Number)` callable for complex `z`.
+Λ(z::Number, λ₁::Number, λ₂::Number) = gamma(z+λ₁)/gamma(z+λ₂)
+function Λ(x::Float64, λ₁::Float64, λ₂::Float64)
     if min(x+λ₁,x+λ₂) ≥ 8.979120323411497
         exp(λ₂-λ₁+(x-.5)*log1p((λ₁-λ₂)/(x+λ₂)))*(x+λ₁)^λ₁/(x+λ₂)^λ₂*stirlingseries(x+λ₁)/stirlingseries(x+λ₂)
     else
-        (x+λ₂)/(x+λ₁)*Λ(x+1.,λ₁,λ₂)
+        (x+λ₂)/(x+λ₁)*Λ(x + 1.0, λ₁, λ₂)
     end
 end
 
@@ -259,6 +268,34 @@ function chebyshevmoments1(::Type{T}, N::Int) where T
     μ
 end
 
+"""
+Modified Chebyshev moments of the first kind:
+
+```math
+    \\int^a T_n(x) {\\rm\\,d}x.
+```
+"""
+function chebyshevmoments1(::Type{T}, N::Int, a::T) where T
+    # Returns a length-`N` vector whose (n+1)-th entry is an antiderivative of T_n evaluated at
+    # `a`: `a` for n = 0, `a^2/2` for n = 1, and (T_{n+1}(a)/(n+1) - T_{n-1}(a)/(n-1))/2 for
+    # n ≥ 2, with T_k(a) = cos(k*acos(a)).
+    μ = zeros(T, N)
+    # Guard the first two entries so that N = 0 and N = 1 return a short vector instead of
+    # indexing out of bounds.
+    N > 0 && (μ[1] = a)
+    N > 1 && (μ[2] = a^2/2)
+    θ = acos(a)
+    for i = 2:N-1
+        @inbounds μ[i+1] = (cos((i+1)*θ)/(i+1) - cos((i-1)*θ)/(i-1))/2
+    end
+    μ
+end
+
+# Moments for a piecewise-constant weight: `a` holds the M+1 breakpoints and `w` the M weights.
+# Each piece contributes its weight times the difference of the single-endpoint moment vectors
+# at its two breakpoints.
+function chebyshevmoments1(::Type{T}, N::Int, a::NTuple{L, T}, w::NTuple{M, T}) where {T, L, M}
+    @assert L == M+1
+    @assert M > 0
+    total = zeros(T, N)
+    for k in 1:M
+        upper = chebyshevmoments1(T, N, a[k+1])
+        lower = chebyshevmoments1(T, N, a[k])
+        total .+= w[k]*(upper - lower)
+    end
+    return total
+end
+
 """
 Modified Chebyshev moments of the first kind with respect to the Jacobi weight:
 
@@ -282,22 +319,57 @@ end
 Modified Chebyshev moments of the first kind with respect to the logarithmic weight:
 
 ```math
-    \\int_{-1}^{+1} T_n(x) \\log\\left(\\frac{1-x}{2}\\right){\\rm\\,d}x.
+    \\int_{-1}^{+1} T_n(x) \\log\\left(\\frac{2}{1-x}\\right){\\rm\\,d}x.
 ```
 """
 function chebyshevlogmoments1(::Type{T}, N::Int) where T
     μ = zeros(T, N)
-    N > 0 && (μ[1] = -two(T))
+    N > 0 && (μ[1] = two(T))
     if N > 1
-        μ[2] = -one(T)
+        μ[2] = one(T)
         for i=1:N-2
-            cst = isodd(i) ? T(4)/T(i^2-4) : T(4)/T(i^2-1)
+            cst = isodd(i) ? T(4)/T(4-i^2) : T(4)/T(1-i^2)
             @inbounds μ[i+2] = ((i-2)*μ[i]+cst)/(i+2)
         end
     end
     μ
 end
 
+"""
+Modified Chebyshev moments of the first kind with respect to the log-Chebyshev weight:
+
+```math
+    \\int_{-1}^{+1} T_n(x) \\log\\left(\\frac{2}{1-x}\\right)\\frac{{\\rm d}x}{\\sqrt{1-x^2}}.
+```
+"""
+function chebyshevlogchebyshevmoments1(::Type{T}, N::Int) where T
+    # Entry 1 is 2π·log(2); entry n+1 is π/n for n ≥ 1.
+    moments = zeros(T, N)
+    if N ≥ 1
+        moments[1] = 2*log(T(2))*π
+    end
+    for k in 2:N
+        @inbounds moments[k] = T(π)/(k-1)
+    end
+    return moments
+end
+
+"""
+Modified Chebyshev moments of the first kind with respect to the absolute value weight:
+
+```math
+    \\int_{-1}^{+1} T_n(x) |x|{\\rm\\,d}x.
+```
+"""
+function chebyshevabsmoments1(::Type{T}, N::Int) where T
+    # Only degrees that are multiples of four are assigned; all other entries stay zero.
+    moments = zeros(T, N)
+    for k in 1:4:N
+        half_degree = (k-1)÷2
+        @inbounds moments[k] = -T(1)/T(half_degree^2-1)
+    end
+    return moments
+end
+
 """
 Modified Chebyshev moments of the second kind:
 
@@ -336,7 +408,7 @@ end
 Modified Chebyshev moments of the second kind with respect to the logarithmic weight:
 
 ```math
-    \\int_{-1}^{+1} U_n(x) \\log\\left(\\frac{1-x}{2}\\right){\\rm\\,d}x.
+    \\int_{-1}^{+1} U_n(x) \\log\\left(\\frac{2}{1-x}\\right){\\rm\\,d}x.
 ```
 """
 function chebyshevlogmoments2(::Type{T}, N::Int) where T
@@ -350,6 +422,23 @@ function chebyshevlogmoments2(::Type{T}, N::Int) where T
     μ
 end
 
+"""
+Modified Chebyshev moments of the second kind with respect to the absolute value weight:
+
+```math
+    \\int_{-1}^{+1} U_n(x) |x|{\\rm\\,d}x.
+```
+"""
+function chebyshevabsmoments2(::Type{T}, N::Int) where T
+    # Start from the first-kind moments and convert them in place: entry k becomes 2*entry k + entry k-2.
+    moments = chebyshevabsmoments1(T, N)
+    N > 1 || return moments
+    moments[2] *= two(T)    # the second entry is simply doubled
+    for k = 3:N
+        @inbounds moments[k] = 2*moments[k] + moments[k-2]
+    end
+    moments
+end
 
 function sphrand(::Type{T}, m::Int, n::Int) where T
     A = zeros(T, m, n)
@@ -551,6 +640,11 @@ end
 
 trizeros(::Type{T}, m::Int, n::Int) where T = zeros(T, m, n)
 
+const rectdiskrand = trirand
+const rectdiskrandn = trirandn
+const rectdiskones = triones
+const rectdiskzeros = trizeros
+
 """
 Pointwise evaluation of triangular harmonic:
 
@@ -605,3 +699,50 @@ function tetones(::Type{T}, l::Int, m::Int, n::Int) where T
 end
 
 tetzeros(::Type{T}, l::Int, m::Int, n::Int) where T = zeros(T, l, m, n)
+
+function spinsphrand(::Type{T}, m::Int, n::Int, s::Int) where T
+    A = zeros(T, m, n)
+    as = abs(s)
+    for i = 1:m-as
+        A[i,1] = rand(T)
+    end
+    for j = 1:n÷2
+        for i = 1:m-max(j, as)
+            A[i,2j] = rand(T)
+            A[i,2j+1] = rand(T)
+        end
+    end
+    A
+end
+
+function spinsphrandn(::Type{T}, m::Int, n::Int, s::Int) where T
+    A = zeros(T, m, n)
+    as = abs(s)
+    for i = 1:m-as
+        A[i,1] = randn(T)
+    end
+    for j = 1:n÷2
+        for i = 1:m-max(j, as)
+            A[i,2j] = randn(T)
+            A[i,2j+1] = randn(T)
+        end
+    end
+    A
+end
+
+function spinsphones(::Type{T}, m::Int, n::Int, s::Int) where T
+    A = zeros(T, m, n)
+    as = abs(s)
+    for i = 1:m-as
+        A[i,1] = one(T)
+    end
+    for j = 1:n÷2
+        for i = 1:m-max(j, as)
+            A[i,2j] = one(T)
+            A[i,2j+1] = one(T)
+        end
+    end
+    A
+end
+
+spinsphzeros(::Type{T}, m::Int, n::Int) where T = zeros(T, m, n)
diff --git a/src/toeplitzhankel.jl b/src/toeplitzhankel.jl
new file mode 100644
index 00000000..1c552a09
--- /dev/null
+++ b/src/toeplitzhankel.jl
@@ -0,0 +1,734 @@
+"""
+Represent a scaled Toeplitz∘Hankel matrix:
+
+    DL(T∘H)DR
+
+where the Hankel matrix `H` is non-negative definite, via
+
+    ∑_{k=1}^r Diagonal(L[:,k])*T*Diagonal(R[:,k])
+
+where `L` and `R` are determined by doing a rank-r pivoted Cholesky decomposition of `H`, which in low rank form is
+
+    H ≈ ∑_{k=1}^r C[:,k]C[:,k]'
+
+so that `L[:,k] = DL*C[:,k]` and `R[:,k] = DR*C[:,k]`.
+
+This allows a Cholesky decomposition in 𝒪(K²N) operations and 𝒪(KN) storage, K = log N log ɛ⁻¹.
+The tuple storage allows plans applied to each dimension.
+"""
+struct ToeplitzHankelPlan{S, N, N1, LowR, TP, Dims} <: Plan{S}
+    T::TP # A length M Vector or Tuple of ToeplitzPlan
+    L::LowR  # A length M Vector or Tuple of Matrices storing low rank factors of L
+    R::LowR # A length M Vector or Tuple of Matrices storing low rank factors of R
+    tmp::Array{S,N1} # A larger dimensional array to transform each scaled array all-at-once
+    dims::Dims # A length M Vector or Tuple of Int storing the dimensions acted on
+    function ToeplitzHankelPlan{S,N,N1,LowR,TP,Dims}(T::TP, L::LowR, R::LowR, dims) where {S,N,N1,LowR,TP,Dims}
+        tmp = Array{S}(undef, max.(size.(T)...)...)
+        new{S,N,N1,LowR,TP,Dims}(T, L, R, tmp, dims)
+    end
+end
+
+
+ToeplitzHankelPlan{S,N,M}(T::TP, L::LowR, R::LowR, dims::Dims) where {S,N,M,LowR,TP,Dims} = ToeplitzHankelPlan{S,N,M,LowR,TP,Dims}(T, L, R, dims)
+ToeplitzHankelPlan{S,N}(T, L, R, dims) where {S,N} = ToeplitzHankelPlan{S,N,N+1}(T, L, R, dims)
+ToeplitzHankelPlan(T::ToeplitzPlan{S,M}, L::Matrix, R::Matrix, dims=1) where {S,M} = ToeplitzHankelPlan{S,M-1,M}((T,), (L,), (R,), dims)
+
+size(TH::ToeplitzHankelPlan) = size(first(TH.T))
+
+
+_reshape_broadcast(d, R, ::Val{N}, M) where N = reshape(R,ntuple(k -> k == d ? size(R,1) : 1, Val(N))...,M)
+function _th_applymul!(d, v::AbstractArray{<:Any,N}, T, L, R, tmp) where N
+    M = size(R,2)
+    ax = (axes(v)..., OneTo(M))
+    tmp[ax...] .=  _reshape_broadcast(d, R, Val(N), M) .* v
+    T * view(tmp, ax...)
+    view(tmp,ax...) .*= _reshape_broadcast(d, L, Val(N), M)
+    sum!(v, view(tmp,ax...))
+end
+
+
+function *(P::ToeplitzHankelPlan{<:Any,N}, v::AbstractArray{<:Any,N}) where N
+    for (R,L,T,d) in zip(P.R,P.L,P.T,P.dims)
+        _th_applymul!(d, v, T, L, R, P.tmp)
+    end
+    v
+end
+
+*(P::ToeplitzHankelPlan, v::AbstractArray) = error("plan applied to wrong-sized array")
+
+
+# Partial (pivoted) Cholesky for a Hankel matrix H stored through its anti-diagonal values, H[i,j] = v[i+j-1].
+# Returns the n×r factor C[:,1:r] (H ≈ C*C'); pivoting stops once the largest remaining diagonal entry is ≤ reltol.
+function hankel_partialchol(v::Vector{T}) where T
+    # Assumes positive definite
+    σ = T[] # inverse pivots, one per accepted column
+    n = isempty(v) ? 0 : (length(v)+2) ÷ 2
+    C = Matrix{T}(undef, n, n)
+    d = v[1:2:end] # diag of H
+    @assert length(v) ≥ 2n-1
+    reltol = maximum(abs,d)*eps(T)*log(n)
+    r = 0 # number of accepted columns
+    for k = 1:n
+        mx,idx = findmax(d) # pivot: largest remaining diagonal entry and its position
+        if mx ≤ reltol break end
+        push!(σ, inv(mx))
+        C[:,k] .= view(v,idx:n+idx-1) # column idx of H
+        for j = 1:k-1
+            # subtract the contribution of the previously accepted column j
+            nCjidxσj = -C[idx,j]*σ[j]
+            LinearAlgebra.axpy!(nCjidxσj, view(C,:,j), view(C,:,k))
+        end
+        @inbounds for p=1:n
+            d[p] -= C[p,k]^2/mx # update the remaining diagonal
+        end
+        r += 1
+    end
+    for k=1:length(σ) rmul!(view(C,:,k), sqrt(σ[k])) end # scale column k by 1/sqrt(pivot)
+    C[:,1:r]
+end
+
+# Partial (pivoted) Cholesky factor C of D .* H .* D', where H[i,j] = v[i+j-1]; errors if more than 100 columns are needed.
+function hankel_partialchol(v::Vector, D::AbstractVector)
+    T = promote_type(eltype(v), eltype(D))
+    # Assumes positive definite
+    σ = T[] # inverse pivots, one per accepted column
+    n = isempty(v) ? 0 : (length(v)+2) ÷ 2
+    C = Matrix{T}(undef, n, 100) # storage for at most 100 columns
+    d = v[1:2:end] .* D.^2 # diag of D .* H .* D'
+    @assert length(v) ≥ 2n-1
+    reltol = maximum(abs,d)*eps(T)*log(n)
+    r = 0 # number of accepted columns
+    for k = 1:n
+        mx,idx = findmax(d) # pivot: largest remaining diagonal entry and its position
+        if mx ≤ reltol break end
+        k > size(C,2) && error("ranks more than 100 not yet supported") # checked only when another column is really needed, so rank 100 itself succeeds
+        push!(σ, inv(mx))
+        C[:,k] .= view(v,idx:n+idx-1) .*D.*D[idx] # column idx of D .* H .* D'
+        for j = 1:k-1
+            nCjidxσj = -C[idx,j]*σ[j] # subtract the contribution of the previously accepted column j
+            LinearAlgebra.axpy!(nCjidxσj, view(C,:,j), view(C,:,k))
+        end
+        @inbounds for p=1:n
+            d[p] -= C[p,k]^2/mx # update the remaining diagonal
+        end
+        r += 1
+    end
+    for k=1:length(σ) rmul!(view(C,:,k), sqrt(σ[k])) end # scale column k by 1/sqrt(pivot)
+    C[:,1:r]
+end
+
+
+
+# Diagonally-scaled Toeplitz∘Hankel polynomial transforms
+
+
+
+struct ChebyshevToLegendrePlanTH{S,TH<:ToeplitzHankelPlan{S}} <: Plan{S}
+    toeplitzhankel::TH
+end
+
+function *(P::ChebyshevToLegendrePlanTH, v::AbstractVector{S}) where S
+    n = length(v)
+    ret = zero(S)
+    @inbounds for k = 1:2:n
+        ret += -v[k]/(k*(k-2))
+    end
+    v[1] = ret
+    P.toeplitzhankel*view(v,2:n)
+    v
+end
+
+function _cheb2leg_rescale1!(V::AbstractArray{S}, Rpre, Rpost, d) where S
+    m = size(V,d)
+    for Ipost in Rpost, Ipre in Rpre
+        ret = zero(S)
+        @inbounds for k = 1:2:m
+            ret += -V[Ipre,k,Ipost]/(k*(k-2))
+        end
+        V[Ipre,1,Ipost] = ret
+    end
+    V
+end
+
+_dropfirstdim(d::Int) = ()
+_dropfirstdim(d::Int, m, szs...) = ((d == 1 ? 2 : 1):m, _dropfirstdim(d-1, szs...)...)
+
+function *(P::ChebyshevToLegendrePlanTH, V::AbstractArray)
+    th = P.toeplitzhankel # wrapped plan: one (dim, R, L, T) group per transformed dimension
+    tmp = th.tmp
+    for (d,R,L,T) in zip(th.dims, th.R, th.L, th.T)
+        Rpre = CartesianIndices(axes(V)[1:d-1]) # indices of the dimensions before d
+        Rpost = CartesianIndices(axes(V)[d+1:end]) # indices of the dimensions after d
+        _cheb2leg_rescale1!(V, Rpre, Rpost, d) # overwrite the first entry along d with a weighted sum of the odd-indexed entries
+        _th_applymul!(d, view(V, _dropfirstdim(d, size(V)...)...), T, L, R, tmp) # transform entries 2:end along d in place
+    end
+    V
+end
+
+_add1tod(d::Integer, a, b...) = d == 1 ? (a+1, b...) : (a, _add1tod(d-1, b...)...)
+_add1tod(d, a, b...) = _add1tod(first(d), a, b...)
+size(P::ChebyshevToLegendrePlanTH) = Base.front(_add1tod(P.toeplitzhankel.dims, size(first(P.toeplitzhankel.T))...))
+inv(P::ChebyshevToLegendrePlanTH{T}) where T = plan_th_leg2cheb!(T, size(P), P.toeplitzhankel.dims)
+
+
+function _leg2chebTH_TLC(::Type{S}, mn, d) where S
+    n = mn[d]
+    λ = Λ.(0:half(real(S)):n-1)
+    t = zeros(S,n)
+    t[1:2:end] .= 2 .* view(λ, 1:2:n) ./ π
+    C = hankel_partialchol(λ)
+    T = plan_uppertoeplitz!(t, (mn..., size(C,2)), d)
+    L = copy(C)
+    L[1,:] ./= 2
+    T,L,C
+end
+
+function _leg2chebuTH_TLC(::Type{S}, mn, d) where {S}
+    n = mn[d]
+    S̃ = real(S)
+    λ = Λ.(0:half(S̃):n-1)
+    t = zeros(S,n)
+    t[1:2:end] = λ[1:2:n]./(((1:2:n).-2))
+    h = λ./((1:2n-1).+1)
+    C = hankel_partialchol(h)
+    T = plan_uppertoeplitz!(-2t/π, (mn..., size(C,2)), d)
+    (T, (1:n) .* C, C)
+end
+
+for f in (:leg2cheb, :leg2chebu)
+    plan = Symbol("plan_th_", f, "!")
+    TLC = Symbol("_", f, "TH_TLC")
+    @eval begin
+        $plan(::Type{S}, mn::NTuple{N,Int}, dims::Int) where {S,N} = ToeplitzHankelPlan($TLC(S, mn, dims)..., dims)
+        function $plan(::Type{S}, mn::NTuple{N,Int}, dims) where {S,N}
+            TLCs = $TLC.(S, Ref(mn), dims)
+            ToeplitzHankelPlan{S,N}(map(first, TLCs), map(TLC -> TLC[2], TLCs), map(last, TLCs), dims)
+        end
+    end
+end
+
+###
+# th_cheb2leg
+###
+
+_sub_dim_by_one(d) = ()
+_sub_dim_by_one(d, m, n...) = (isone(d) ? m-1 : m, _sub_dim_by_one(d-1, n...)...)
+
+function _cheb2legTH_TLC(::Type{S}, mn, d) where S
+    n = mn[d]
+    t = zeros(S,n-1)
+    S̃ = real(S)
+    if n > 1
+        t[1:2:end] = Λ.(0:one(S̃):div(n-2,2), -half(S̃), one(S̃))
+    end
+    h = Λ.(1:half(S̃):n-1, zero(S̃), 3half(S̃))
+    D = 1:n-1
+    DL = (3half(S̃):n-half(S̃)) ./ D
+    DR = -(one(S̃):n-one(S̃)) ./ (4 .* D)
+    C = hankel_partialchol(h, D)
+    T = plan_uppertoeplitz!(t, (_sub_dim_by_one(d, mn...)..., size(C,2)), d)
+    T, DL .* C, DR .* C
+end
+
+plan_th_cheb2leg!(::Type{S}, mn::NTuple{N,Int}, dims::Int) where {S,N} = ChebyshevToLegendrePlanTH(ToeplitzHankelPlan(_cheb2legTH_TLC(S, mn, dims)..., dims))
+
+function plan_th_cheb2leg!(::Type{S}, mn::NTuple{N,Int}, dims) where {S,N}
+    TLCs = _cheb2legTH_TLC.(S, Ref(mn), dims)
+    ChebyshevToLegendrePlanTH(ToeplitzHankelPlan{S,N}(map(first, TLCs), map(TLC -> TLC[2], TLCs), map(last, TLCs), dims))
+end
+
+
+###
+# th_ultra2ultra
+###
+
+# The second case handles zero
+isapproxinteger(::Integer) = true
+isapproxinteger(x) = isinteger(x) || x ≈ round(Int,x)  || x+1 ≈ round(Int,x+1)
+
+"""
+    _nearest_jacobi_par(α, γ)
+
+returns a number that differs from `γ` by an integer (up to the tolerance of `isapproxinteger`) but is less than 1 away from `α`.
+"""
+function _nearest_jacobi_par(α::T, γ::T) where T
+    ret = isapproxinteger(α-γ) ? α : round(Int,α,RoundDown) + mod(γ,1) # otherwise: floor(α) plus the fractional part of γ
+    ret ≤ -1 ? ret + 1 : ret # a candidate ≤ -1 is shifted up by one
+end
+_nearest_jacobi_par(α::T, ::T) where T<:Integer = α # two integers already differ by an integer
+_nearest_jacobi_par(α, γ) = _nearest_jacobi_par(promote(α,γ)...) # mixed argument types: promote to a common type first
+
+
+struct Ultra2UltraPlanTH{T, Plans, Dims} <: Plan{T}
+    plans::Plans
+    λ₁::T
+    λ₂::T
+    dims::Dims
+end
+
+function *(P::Ultra2UltraPlanTH, A::AbstractArray)
+    ret = A
+    if isapproxinteger(P.λ₂ - P.λ₁)
+        _ultra2ultra_integerinc!(ret, P.λ₁, P.λ₂, P.dims)
+    else
+        for p in P.plans
+            ret = p*ret
+        end
+        c = _nearest_jacobi_par(P.λ₁, P.λ₂)
+
+        _ultra2ultra_integerinc!(ret, c, P.λ₂, P.dims)
+    end
+end
+
+function _ultra2ultraTH_TLC(::Type{S}, mn, λ₁, λ₂, d) where {S}
+    n = mn[d]
+    @assert abs(λ₁-λ₂) < 1
+    S̃ = real(S)
+    DL = (zero(S̃):n-one(S̃)) .+ λ₂
+    jk = 0:half(S̃):n-1
+    t = zeros(S,n)
+    t[1:2:n] = Λ.(jk,λ₁-λ₂,one(S̃))[1:2:n]
+    h = Λ.(jk,λ₁,λ₂+one(S̃))
+    lmul!(gamma(λ₂)/gamma(λ₁),h)
+    C = hankel_partialchol(h)
+    T = plan_uppertoeplitz!(lmul!(inv(gamma(λ₁-λ₂)),t), (mn..., size(C,2)), d)
+    T, DL .* C, C
+end
+
+_good_plan_th_ultra2ultra!(::Type{S}, mn, λ₁, λ₂, dims::Int) where S = ToeplitzHankelPlan(_ultra2ultraTH_TLC(S, mn, λ₁, λ₂, dims)..., dims)
+
+function _good_plan_th_ultra2ultra!(::Type{S}, mn::NTuple{2,Int}, λ₁, λ₂, dims::NTuple{2,Int}) where S
+    T1,L1,C1 = _ultra2ultraTH_TLC(S, mn, λ₁, λ₂, 1)
+    T2,L2,C2 = _ultra2ultraTH_TLC(S, mn, λ₁, λ₂, 2)
+    ToeplitzHankelPlan{S,2}((T1,T2), (L1,L2), (C1,C2), dims)
+end
+
+
+
+function plan_th_ultra2ultra!(::Type{S}, mn, λ₁, λ₂, dims) where {S}
+    c = _nearest_jacobi_par(λ₁, λ₂)
+
+    if isapproxinteger(λ₂ - λ₁)
+        # TODO: don't make extra plan
+        plans = typeof(_good_plan_th_ultra2ultra!(S, mn, λ₂+0.1, λ₂, dims))[]
+    else
+        plans = [_good_plan_th_ultra2ultra!(S, mn, λ₁, c, dims)]
+    end
+
+    Ultra2UltraPlanTH(plans, λ₁, λ₂, dims)
+end
+
+function _ultra_raise!(B, λ)
+    m, n = size(B, 1), size(B, 2)
+
+    if m > 1
+        @inbounds for j = 1:n
+            for i = 1:m-2
+                Bij = λ / (i+λ-1) * B[i,j]
+                Bij += -λ / (i+λ+1) * B[i+2,j]
+                B[i,j] = Bij
+            end
+            B[m-1,j] = λ / (m+λ-2)*B[m-1,j]
+            B[m,j] = λ / (m+λ-1)*B[m,j]
+        end
+    end
+    B
+end
+
+function _ultra_lower!(B, λ)
+    m, n = size(B, 1), size(B, 2)
+
+    if m > 1
+        @inbounds for j = 1:n
+            B[m,j] = (m+λ-1)/λ * B[m,j]
+            B[m-1,j] = (m+λ-2)/λ *B[m-1,j]
+            for i = m-2:-1:1
+                Bij = B[i,j] + λ / (i+λ+1) * B[i+2,j]
+                B[i,j] = (i+λ-1)/λ * Bij
+            end  
+        end
+    end
+    B
+end
+
+
+
+function _ultra_raise!(x, λ, dims)
+    for d in dims
+        if d == 1
+            _ultra_raise!(x, λ)
+        else
+            _ultra_raise!(x', λ)
+        end
+    end
+    x
+end
+
+function _ultra_lower!(x, λ, dims)
+    for d in dims
+        if d == 1
+            _ultra_lower!(x, λ-1)
+        else
+            _ultra_lower!(x', λ-1)
+        end
+    end
+    x
+end
+
+function _ultra2ultra_integerinc!(x, λ₁, λ₂, dims) # move x in place from parameter λ₁ to λ₂ in unit steps along dims; returns x
+    while !(λ₁ ≈ λ₂) # NOTE(review): this only terminates when λ₂-λ₁ is (approximately) an integer — callers appear to ensure that; confirm
+        if λ₂ > λ₁
+            _ultra_raise!(x, λ₁, dims) # one step up: λ₁ → λ₁+1
+            λ₁ += 1
+        else
+            _ultra_lower!(x, λ₁, dims) # one step down: λ₁ → λ₁-1
+            λ₁ -= 1
+        end
+    end
+    x
+end
+
+###
+# th_jac2jac
+###
+
+
+function _lmul!(A::Bidiagonal, B::AbstractVecOrMat)
+    @assert A.uplo == 'U'
+    
+    m, n = size(B, 1), size(B, 2)
+    if m != size(A, 1)
+        throw(DimensionMismatch("right hand side B needs first dimension of size $(size(A,1)), has size $m"))
+    end
+    @inbounds for j = 1:n
+        for i = 1:m-1
+            Bij = A.dv[i]*B[i,j]
+            Bij += A.ev[i]*B[i+1,j]
+            B[i,j] = Bij
+        end
+        B[m,j] = A.dv[m]*B[m,j]
+    end
+    B
+end
+
+struct Jac2JacPlanTH{T, Plans, Dims} <: Plan{T}
+    plans::Plans
+    α::T
+    β::T
+    γ::T
+    δ::T
+    dims::Dims
+end
+
+Jac2JacPlanTH(plans, α, β, γ, δ, dims) = Jac2JacPlanTH(plans, promote(α, β, γ, δ)..., dims)
+
+function *(P::Jac2JacPlanTH, A::AbstractArray)
+    if P.α + P.β ≤ -1
+        _jacobi_raise_a!(A, P.α, P.β, P.dims)
+        c,d = _nearest_jacobi_par(P.α+1, P.γ), _nearest_jacobi_par(P.β, P.δ)
+    else
+        c,d = _nearest_jacobi_par(P.α, P.γ), _nearest_jacobi_par(P.β, P.δ)
+    end
+
+    ret = A
+    for p in P.plans
+        ret = p*ret
+    end
+
+    _jac2jac_integerinc!(ret, c, d, P.γ, P.δ, P.dims)
+end
+
+function alternatesign!(v)
+    # Flip the sign of entries 2, 4, 6, … in place; odd-indexed entries are untouched.
+    for pair = 1:length(v)÷2
+        @inbounds v[2pair] = -v[2pair]
+    end
+    v
+end
+
+function _jac2jacTH_TLC(::Type{S}, mn, α, β, γ, δ, d) where {S}
+    n = mn[d]
+    @assert α+β > -1
+    if β == δ
+        @assert abs(α-γ) < 1
+        jk = 0:n-1
+        DL = (2jk .+ γ .+ β .+ 1).*Λ.(jk,γ+β+1,β+1)
+        t = convert(AbstractVector{S}, Λ.(jk, α-γ,1))
+        h = Λ.(0:2n-2,α+β+1,γ+β+2)
+        DR = Λ.(jk,β+1,α+β+1)./gamma(α-γ)
+        C = hankel_partialchol(h)
+        T = plan_uppertoeplitz!(t, (mn..., size(C,2)), d)
+    elseif α == γ
+        @assert abs(β-δ) < 1
+        jk = 0:n-1
+        DL = (2jk .+ δ .+ α .+ 1).*Λ.(jk,δ+α+1,α+1)
+        h = Λ.(0:2n-2,α+β+1,δ+α+2)
+        DR = Λ.(jk,α+1,α+β+1)./gamma(β-δ)
+        t = alternatesign!(convert(AbstractVector{S}, Λ.(jk,β-δ,1)))
+        C = hankel_partialchol(h)
+        T = plan_uppertoeplitz!(t, (mn..., size(C,2)), d)
+    else
+        throw(ArgumentError("Cannot create Toeplitz dot Hankel, use a sequence of plans."))
+    end
+
+    (T, DL .* C, DR .* C)
+end
+
+_good_plan_th_jac2jac!(::Type{S}, mn, α, β, γ, δ, dims::Int) where S = ToeplitzHankelPlan(_jac2jacTH_TLC(S, mn, α, β, γ, δ, dims)..., dims)
+
+function _good_plan_th_jac2jac!(::Type{S}, mn::NTuple{2,Int}, α, β, γ, δ, dims::NTuple{2,Int}) where S
+    T1,L1,C1 = _jac2jacTH_TLC(S, mn, α, β, γ, δ, 1)
+    T2,L2,C2 = _jac2jacTH_TLC(S, mn, α, β, γ, δ, 2)
+    ToeplitzHankelPlan{S,2}((T1,T2), (L1,L2), (C1,C2), dims)
+end
+
+
+
+function plan_th_jac2jac!(::Type{S}, mn, α, β, γ, δ, dims) where {S}
+    if α + β ≤ -1
+        c,d = _nearest_jacobi_par(α+1, γ), _nearest_jacobi_par(β, δ)
+    else
+        c,d = _nearest_jacobi_par(α, γ), _nearest_jacobi_par(β, δ)
+    end
+
+    if isapproxinteger(β - δ) && isapproxinteger(α-γ)
+        # TODO: don't make extra plan
+        plans = typeof(_good_plan_th_jac2jac!(S, mn, α+0.1, β, α, β, dims))[]
+    elseif isapproxinteger(α - γ) || isapproxinteger(β - δ)
+        if α + β ≤ -1
+            # avoid degenerecies
+            plans = [_good_plan_th_jac2jac!(S, mn, α+1, β, c, d, dims)]
+        else
+            plans = [_good_plan_th_jac2jac!(S, mn, α, β, c, d, dims)]
+        end
+    else
+        if α + β ≤ -1
+            plans = [_good_plan_th_jac2jac!(S, mn, α+1, β, α+1, d, dims), _good_plan_th_jac2jac!(S, mn, α+1, d, c, d, dims)]
+        else
+            plans = [_good_plan_th_jac2jac!(S, mn, α, β, α, d, dims), _good_plan_th_jac2jac!(S, mn, α, d, c, d, dims)]
+        end
+    end
+
+    Jac2JacPlanTH(plans, α, β, γ, δ, dims)
+end
+
+
+function _jacobi_raise_a!(B, a, b)
+    m, n = size(B, 1), size(B, 2)
+    if m > 1
+        @inbounds for j = 1:n
+            B[1,j] = B[1,j] - (1+b) / (a+b+3) * B[2,j]
+            for i = 2:m-1
+                B[i,j] = (i+a+b)/(a+b-1+2i) * B[i,j] - (i+b) / (a+b+2i+1) * B[i+1,j]
+            end
+            B[m,j] = (m+a+b)/(a+b-1+2m)*B[m,j]
+        end
+    end
+    B
+end
+
+function _jacobi_lower_a!(B, a, b)
+    m, n = size(B, 1), size(B, 2)
+
+    if m > 1
+        @inbounds for j = 1:n
+            B[m,j] = (a+b-1+2m)/(m+a+b) * B[m,j]
+            for i = m-1:-1:2
+                Bij = B[i,j] + (i+b) / (a+b+2i+1) * B[i+1,j]
+                B[i,j] = (a+b-1+2i)/(i+a+b)  * Bij
+            end
+            B[1,j] = B[1,j] + (1+b) / (a+b+3) * B[2,j]
+        end
+    end
+    B
+end
+
+
+
+function _jacobi_raise_b!(B, a, b)
+    m, n = size(B, 1), size(B, 2)
+    if m > 1
+        @inbounds for j = 1:n
+            B[1,j] = B[1,j] + (1+a) / (a+b+3) * B[2,j]
+            
+            for i = 2:m-1
+                B[i,j] = (i+a+b)/(a+b-1+2i) * B[i,j] + (i+a) / (a+b+2i+1) * B[i+1,j]
+            end
+            B[m,j] = (m+a+b)/(a+b-1+2m)*B[m,j]
+        end
+    end
+    B
+end
+
+function _jacobi_lower_b!(B, a, b)
+    m, n = size(B, 1), size(B, 2)
+
+    if m > 1
+        @inbounds for j = 1:n
+            B[m,j] = (a+b-1+2m)/(m+a+b) * B[m,j]
+            for i = m-1:-1:2
+                Bij = B[i,j] - (i+a) / (a+b+2i+1) * B[i+1,j]
+                B[i,j] = (a+b-1+2i)/(i+a+b)  * Bij
+            end
+            B[1,j] = B[1,j] - (1+a) / (a+b+3) * B[2,j]
+        end
+    end
+    B
+end
+
+
+
+function _jacobi_raise_b!(x, α, β, dims)
+    for d in dims
+        if d == 1
+            _jacobi_raise_b!(x, α, β)
+        else
+            _jacobi_raise_b!(x', α, β)
+        end
+    end
+    x
+end
+function _jacobi_raise_a!(x, α, β, dims)
+    for d in dims
+        if d == 1
+            _jacobi_raise_a!(x, α, β)
+        else
+            _jacobi_raise_a!(x', α, β)
+        end
+    end
+    x
+end
+
+function _jacobi_lower_b!(x, α, β, dims)
+    for d in dims
+        if d == 1
+            _jacobi_lower_b!(x, α, β-1)
+        else
+            _jacobi_lower_b!(x', α, β-1)
+        end
+    end
+    x
+end
+function _jacobi_lower_a!(x, α, β, dims)
+    for d in dims
+        if d == 1
+            _jacobi_lower_a!(x, α-1, β)
+        else
+            _jacobi_lower_a!(x', α-1, β)
+        end
+    end
+    x
+end
+
+
+function _jac2jac_integerinc!(x, α, β, γ, δ, dims)
+    while !(α ≈ γ && β ≈ δ)
+        if !(δ ≈ β) && δ > β
+            _jacobi_raise_b!(x, α, β, dims)
+            β += 1
+        elseif !(δ ≈ β) && δ < β
+            _jacobi_lower_b!(x, α, β, dims)
+            β -= 1
+        elseif !(γ ≈ α) && γ > α
+            _jacobi_raise_a!(x, α, β, dims)
+            α += 1
+        else
+            @assert γ < α
+            _jacobi_lower_a!(x, α, β, dims)
+            α -= 1
+        end
+    end
+    x
+end
+
+
+###
+# other routines
+###
+
+for f in (:th_leg2cheb, :th_cheb2leg, :th_leg2chebu)
+    plan = Symbol("plan_", f, "!")
+    @eval begin
+        $plan(arr::AbstractArray{T}, dims...) where T = $plan(T, size(arr), dims...)
+        $plan(::Type{S}, mn::NTuple{N,Int}) where {S,N} = $plan(S, mn, ntuple(identity,Val(N)))
+        $f(v, dims...) = $plan(eltype(v), size(v), dims...)*copy(v)
+    end
+end
+
+plan_th_ultra2ultra!(::Type{S}, mn::NTuple{N,Int}, λ₁, λ₂, dims::UnitRange) where {N,S} = plan_th_ultra2ultra!(S, mn, λ₁, λ₂, tuple(dims...))
+plan_th_ultra2ultra!(::Type{S}, mn::Tuple{Int}, λ₁, λ₂, dims::Tuple{Int}=(1,)) where {S} = plan_th_ultra2ultra!(S, mn, λ₁, λ₂, dims...)
+plan_th_ultra2ultra!(::Type{S}, (m,n)::NTuple{2,Int}, λ₁, λ₂) where {S} = plan_th_ultra2ultra!(S, (m,n), λ₁, λ₂, (1,2))
+plan_th_ultra2ultra!(arr::AbstractArray{T}, λ₁, λ₂, dims...) where T = plan_th_ultra2ultra!(T, size(arr), λ₁, λ₂, dims...)
+th_ultra2ultra(v, λ₁, λ₂, dims...) = plan_th_ultra2ultra!(eltype(v), size(v), λ₁, λ₂, dims...)*copy(v)
+
+plan_th_jac2jac!(::Type{S}, mn::NTuple{N,Int}, α, β, γ, δ, dims::UnitRange) where {N,S} = plan_th_jac2jac!(S, mn, α, β, γ, δ, tuple(dims...))
+plan_th_jac2jac!(::Type{S}, mn::Tuple{Int}, α, β, γ, δ, dims::Tuple{Int}=(1,)) where {S} = plan_th_jac2jac!(S, mn, α, β, γ, δ, dims...)
+plan_th_jac2jac!(::Type{S}, (m,n)::NTuple{2,Int}, α, β, γ, δ) where {S} = plan_th_jac2jac!(S, (m,n), α, β, γ, δ, (1,2))
+plan_th_jac2jac!(arr::AbstractArray{T}, α, β, γ, δ, dims...) where T = plan_th_jac2jac!(T, size(arr), α, β, γ, δ, dims...)
+th_jac2jac(v, α, β, γ, δ, dims...) = plan_th_jac2jac!(eltype(v), size(v), α, β, γ, δ, dims...)*copy(v)
+
+
+####
+# cheb2jac
+####
+
+struct Cheb2JacPlanTH{T, Pl<:Jac2JacPlanTH{T}} <: Plan{T}
+    jac2jac::Pl
+end
+
+
+struct Jac2ChebPlanTH{T, Pl<:Jac2JacPlanTH{T}} <: Plan{T}
+    jac2jac::Pl
+end
+
+
+function jac_cheb_recurrencecoefficients(T, N) # builds the coefficient triple (A, B, C) that the Cheb2Jac/Jac2Cheb plans hand to forwardrecurrence
+    n = 0:N # N+1 entries per coefficient
+    h = one(T)/2
+    A = (2n .+ one(T)) ./ (n .+ one(T))
+    A[1] /= 2 # the first entry of A is halved
+    A, Zeros(n), # B: `Zeros` built from the same range (presumably a lazy all-zero vector; confirm)
+    ((n .- h) .* (n .- h) .* (2n .+ one(T))) ./ ((n .+ one(T)) .* n .* (2n .- one(T))) # C: the denominator is zero at n = 0, so C[1] is not finite — NOTE(review): presumably never read; confirm
+end
+
+
+function *(P::Cheb2JacPlanTH{T}, X::AbstractArray) where T
+    A,B,C = jac_cheb_recurrencecoefficients(T, max(size(X)...))
+    # Divide X in place by the forwardrecurrence values along each planned dimension, then apply the wrapped jac2jac plan.
+    for d in P.jac2jac.dims
+        if d == 1
+            p = forwardrecurrence(size(X,1), A,B,C, one(T))
+            X .= p .\ X # scale the rows
+        else
+            @assert d == 2
+            n = size(X,2)
+            p = forwardrecurrence(n, A,B,C, one(T))
+            X .= X ./ transpose(p) # scale the columns
+        end
+    end
+    P.jac2jac*X
+end
+
+function *(P::Jac2ChebPlanTH{T}, X::AbstractArray) where T
+    X = P.jac2jac*X
+    A,B,C = jac_cheb_recurrencecoefficients(T, max(size(X)...))
+    # After the wrapped jac2jac plan, multiply X in place by the forwardrecurrence values along each planned dimension.
+    for d in P.jac2jac.dims
+        if d == 1
+            p = forwardrecurrence(size(X,1), A,B,C, one(T))
+            X .= p .* X # scale the rows
+        else
+            @assert d == 2
+            n = size(X,2)
+            p = forwardrecurrence(n, A,B,C, one(T))
+            X .= X .* transpose(p) # scale the columns
+        end
+    end
+    X
+end
+
+plan_th_cheb2jac!(::Type{T}, mn, α, β, dims...) where T = Cheb2JacPlanTH(plan_th_jac2jac!(T, mn, -one(α)/2, -one(α)/2, α, β, dims...))
+plan_th_cheb2jac!(arr::AbstractArray{T}, α, β, dims...) where T = plan_th_cheb2jac!(T, size(arr), α, β, dims...)
+th_cheb2jac(v, α, β, dims...) = plan_th_cheb2jac!(eltype(v), size(v), α, β, dims...)*copy(v)
+
+plan_th_jac2cheb!(::Type{T}, mn, α, β, dims...) where T = Jac2ChebPlanTH(plan_th_jac2jac!(T, mn, α, β, -one(α)/2, -one(α)/2, dims...))
+plan_th_jac2cheb!(arr::AbstractArray{T}, α, β, dims...) where T = plan_th_jac2cheb!(T, size(arr), α, β, dims...)
+th_jac2cheb(v, α, β, dims...) = plan_th_jac2cheb!(eltype(v), size(v), α, β, dims...)*copy(v)
diff --git a/src/toeplitzplans.jl b/src/toeplitzplans.jl
new file mode 100644
index 00000000..9be77234
--- /dev/null
+++ b/src/toeplitzplans.jl
@@ -0,0 +1,111 @@
+using FFTW
+import FFTW: plan_r2r!
+
+
+"""
+    ToeplitzPlan
+
+applies Toeplitz matrices fast along each dimension.
+The plan carries a work array `tmp` together with the forward/inverse plans `dft`/`idft` that are applied to it.
+"""
+struct ToeplitzPlan{T, N, Dims, S, VECS, P<:Plan{S}, Pi<:Plan{S}} <: Plan{T}
+    vectors::VECS # Vector or Tuple of storage
+    tmp::Array{S,N} # work array, zero-filled and overwritten on every application
+    dft::P # forward transform plan applied to tmp
+    idft::Pi # inverse transform plan applied to tmp
+    dims::Dims # dimension(s) the plan acts on
+end
+
+ToeplitzPlan{T}(v, tmp::Array{S,N}, dft::Plan{S}, idft::Plan{S}, dims) where {T,S,N} = ToeplitzPlan{T,N,typeof(dims),S,typeof(v),typeof(dft), typeof(idft)}(v, tmp, dft, idft, dims)
+
+
+divdimby2(d::Int, sz1, szs...) = isone(d) ? ((sz1 + 1) ÷ 2, szs...) : (sz1, divdimby2(d-1, szs...)...)
+muldimby2(d::Int, sz1, szs...) = isone(d) ? (max(0,2sz1 - 1), szs...) : (sz1, muldimby2(d-1, szs...)...)
+
+function toeplitzplan_size(dims, szs)
+    ret = szs
+    for d in dims
+        ret = divdimby2(d, ret...)
+    end
+    ret
+end
+
+function to_toeplitzplan_size(dims, szs)
+    ret = szs
+    for d in dims
+        ret = muldimby2(d, ret...)
+    end
+    ret
+end
+
+
+size(A::ToeplitzPlan) = toeplitzplan_size(A.dims, size(A.tmp))
+
+
+# based on ToeplitzMatrices.jl
+"""
+    maybereal(::Type{T}, x)
+
+Return real-valued part of `x` if `T` is a type of a real number, and `x` otherwise.
+"""
+maybereal(::Type, x) = x
+maybereal(::Type{<:Real}, x) = real(x)
+
+function *(A::ToeplitzPlan{T,N}, X::AbstractArray{T,N}) where {T,N}
+    vcs,Y,dft,idft,dims = A.vectors,A.tmp, A.dft,A.idft,A.dims
+
+    isempty(X) && return X
+
+    fill!(Y, zero(eltype(Y)))
+    copyto!(view(Y, axes(X)...), X)
+
+    # Fourier transform each dimension
+    dft * Y
+
+    # Multiply by a diagonal matrix along each dimension by permuting
+    # to first dimension
+    for (vc,d) in zip(vcs,dims)
+        applydim!(v -> v .= vc .* v, Y, d, :)
+    end
+
+    # Transform back
+    idft * Y
+
+    X .= maybereal.(T, view(Y, axes(X)...))
+    X
+end
+
+
+function uppertoeplitz_padvec(v::AbstractVector{T}) where T
+    len = length(v)
+    padded = zeros(complex(float(T)), max(0,2len-1))
+    len == 0 && return padded
+    padded[1] = v[1]    # layout: v[1], len-1 zeros, then v[len], v[len-1], …, v[2]
+    for (offset, x) in enumerate(Iterators.take(Iterators.reverse(v), len-1))
+        padded[len+offset] = x
+    end
+    padded
+end
+
+safe_fft!(A) = isempty(A) ? A : fft!(A)
+
+uppertoeplitz_vecs(v, dims::AbstractVector, szs) = [safe_fft!(uppertoeplitz_padvec(v[1:szs[d]])) for d in dims]
+uppertoeplitz_vecs(v, dims::Tuple{}, szs) = ()
+uppertoeplitz_vecs(v, dims::Tuple, szs) = (safe_fft!(uppertoeplitz_padvec(v[1:szs[first(dims)]])), uppertoeplitz_vecs(v, tail(dims), szs)...)
+uppertoeplitz_vecs(v, d::Int, szs) = (safe_fft!(uppertoeplitz_padvec(v[1:szs[d]])),)
+
+
+# allow FFT to work by making sure tmp is non-empty
+safe_tmp(tmp::AbstractArray{<:Any,N}) where N = isempty(tmp) ? similar(tmp, ntuple(_ -> 1, Val(N))...) : tmp
+
+function plan_uppertoeplitz!(v::AbstractVector{T}, szs::NTuple{N,Int}, dim=ntuple(identity,Val(N))) where {T,N}
+    S = complex(float(T))
+    
+    tmp = zeros(S, to_toeplitzplan_size(dim, szs)...)
+    dft = plan_fft!(safe_tmp(tmp), dim)
+    idft = plan_ifft!(safe_tmp(similar(tmp)), dim)
+    
+    return ToeplitzPlan{float(T)}(uppertoeplitz_vecs(v, dim, szs), tmp, dft, idft, dim)
+end
+
+plan_uppertoeplitz!(v::AbstractVector{T}) where T = plan_uppertoeplitz!(v, size(v))
diff --git a/test/arraystests.jl b/test/arraystests.jl
new file mode 100644
index 00000000..55167a90
--- /dev/null
+++ b/test/arraystests.jl
@@ -0,0 +1,64 @@
+using FastTransforms, Test
+import FastTransforms: ArrayPlan, NDimsPlan
+
+@testset "Array transform"  begin
+    @testset "ArrayPlan" begin
+        c = randn(5,20,10)
+        F = plan_cheb2leg(c)
+        FT = ArrayPlan(F, c)
+
+        @test size(FT) == size(c)
+
+        f = similar(c);
+        for k in axes(c,3)
+            f[:,:,k] = (F*c[:,:,k])
+        end
+        @test f ≈ FT*c
+        @test c ≈ FT\f
+
+        F = plan_cheb2leg(Vector{Float64}(axes(c,2)))
+        FT = ArrayPlan(F, c, (2,))
+        for k in axes(c,3)
+            f[:,:,k] = (F*c[:,:,k]')'
+        end
+        @test f ≈ FT*c
+        @test c ≈ FT\f
+    end
+
+    @testset "NDimsPlan" begin
+        c = randn(20,10,20)
+        @test_throws ErrorException("Different size in dims axes not yet implemented in N-dimensional transform.") NDimsPlan(ArrayPlan(plan_cheb2leg(c), c), size(c), (1,2))        
+
+        c = randn(5,20)
+        F = plan_cheb2leg(c)
+        FT = ArrayPlan(F, c)
+        P = NDimsPlan(F, size(c), (1,))
+        @test F*c ≈ FT*c ≈ P*c
+
+        c = randn(20,20,5);
+        F = plan_cheb2leg(c)
+        FT = ArrayPlan(F, c)
+        P = NDimsPlan(FT, size(c), (1,2))
+
+        @test size(P) == size(c)
+
+        f = similar(c);
+        for k in axes(f,3)
+            f[:,:,k] = (F*(F*c[:,:,k])')'
+        end
+        @test f ≈ P*c
+        @test c ≈ P\f
+
+        c = randn(5,10,10,60)
+        F = plan_cheb2leg(randn(10))
+        P = NDimsPlan(F, size(c), (2,3))
+        f = similar(c)
+        for i in axes(f,1), j in axes(f,4)
+            f[i,:,:,j] = (F*(F*c[i,:,:,j])')'
+        end
+        @test f ≈ P*c
+        @test c ≈ P\f
+    end
+end
+
+
diff --git a/test/chebyshevtests.jl b/test/chebyshevtests.jl
index 8855a405..763ac3ce 100644
--- a/test/chebyshevtests.jl
+++ b/test/chebyshevtests.jl
@@ -2,153 +2,491 @@ using FastTransforms, Test
 
 @testset "Chebyshev transform"  begin
     @testset "Chebyshev points" begin
-        @test chebyshevpoints(10; kind=1) == chebyshevpoints(Float64, 10; kind=1)
-        @test chebyshevpoints(10; kind=2) == chebyshevpoints(Float64, 10; kind=2)
+        @test @inferred(chebyshevpoints(10)) == @inferred(chebyshevpoints(Float64, 10))
+        @test @inferred(chebyshevpoints(10, Val(2))) == @inferred(chebyshevpoints(Float64, 10, Val(2)))
         for T in (Float32, Float64, ComplexF32, ComplexF64)
-            @test chebyshevpoints(T, 0, kind=1) == chebyshevpoints(T, 0, kind=2) == T[]
-            @test chebyshevpoints(T, 1, kind=1) == chebyshevpoints(T, 1, kind=2) == T[0]
+            @test chebyshevpoints(T, 0) == T[]
+            @test chebyshevpoints(T, 1) == T[0]
 
             n = 20
-            @inferred(chebyshevpoints(T, n, kind=1))
-            @inferred(chebyshevpoints(T, n, kind=2))
-            @test_throws ArgumentError chebyshevpoints(n, kind=-1)
+            @test @inferred(chebyshevpoints(T, n)) == [sinpi(convert(T,n-2k+1)/(2n)) for k=1:n]
+            @test @inferred(chebyshevpoints(T, n, Val(2))) == [sinpi(convert(T,n-2k+1)/(2n-2)) for k=1:n]
+
+            @test_throws MethodError chebyshevpoints(n, Val(-1))
+            @test_throws ArgumentError chebyshevpoints(T, 0, Val(2))
+            @test_throws ArgumentError chebyshevpoints(T, 1, Val(2))
         end
     end
 
     @testset "Chebyshev first kind points <-> first kind coefficients" begin
         for T in (Float32, Float64, ComplexF32, ComplexF64)
             n = 20
-            p_1 = chebyshevpoints(T, n, kind=1)
+            p_1 = chebyshevpoints(T, n)
             f = exp.(p_1)
-            f̌ = chebyshevtransform(f; kind=1)
+            g = @inferred(chebyshevtransform(f))
+            @test g == chebyshevtransform!(copy(f))
 
-            f̃ = x -> [cos(k*acos(x)) for k=0:n-1]' * f̌
+            f̃ = x -> [cos(k*acos(x)) for k=0:n-1]' * g
             @test f̃(0.1) ≈ exp(T(0.1))
-            @test ichebyshevtransform(f̌; kind=1) ≈ exp.(p_1)
-
-            f̃ = copy(f)
-            f̄ = copy(f̌)
-            P = plan_chebyshevtransform(f; kind=1)
-            @test P*f == f̌
-            @test f == f̃
-            P = plan_chebyshevtransform!(f; kind=1)
-            @test P*f == f̌
-            @test f == f̌
-            Pi = plan_ichebyshevtransform(f̌; kind=1)
-            @test Pi*f̌ ≈ f̃
-            @test f̌ == f̄
-            Pi = plan_ichebyshevtransform!(f̌; kind=1)
-            @test Pi*f̌ ≈ f̃
-            @test f̌ ≈ f̃
-
-            @test chebyshevtransform(T[1]; kind=1) == T[1]
-            @test ichebyshevtransform(T[1]; kind=1) == T[1]
-            @test chebyshevtransform(T[]; kind=1) == T[]
-            @test ichebyshevtransform(T[]; kind=1) == T[]
+            @test @inferred(ichebyshevtransform(g)) ≈ ichebyshevtransform!(copy(g)) ≈ exp.(p_1)
+
+            fcopy = copy(f)  # pristine samples, used to detect mutation and to restore f
+            gcopy = copy(g)  # pristine coefficients, used to detect mutation and to restore g
+            P = @inferred(plan_chebyshevtransform(f))
+            @test @inferred(P*f) == g
+            @test f == fcopy  # out-of-place plan must leave its input untouched
+            @test_throws ArgumentError P * T[1,2]
+            P2 = @inferred(plan_chebyshevtransform(f, Val(1), 1:1))  # same transform with explicit kind and dims
+            @test @inferred(P2*f) == g
+            @test_throws ArgumentError P2 * T[1,2]  # check the dims-restricted plan itself, not P again
+
+            P = @inferred(plan_chebyshevtransform!(f))
+            @test @inferred(P*f) == g
+            @test f == g  # in-place plan overwrites f with the coefficients
+            @test_throws ArgumentError P * T[1,2]
+            f .= fcopy  # restore the samples before the next in-place plan
+            P2 = @inferred(plan_chebyshevtransform!(f, 1:1))
+            @test @inferred(P2*f) == g
+            @test f == g
+            @test_throws ArgumentError P2 * T[1,2]  # check the dims-restricted plan itself, not P again
+
+            Pi = @inferred(plan_ichebyshevtransform(g))
+            @test @inferred(Pi*g) ≈ fcopy
+            @test g == gcopy  # out-of-place inverse must leave its input untouched
+            @test_throws ArgumentError Pi * T[1,2]
+            Pi2 = @inferred(plan_ichebyshevtransform(g, 1:1))
+            @test @inferred(Pi2*g) ≈ fcopy
+            @test g == gcopy
+            @test_throws ArgumentError Pi2 * T[1,2]  # check the dims-restricted plan itself, not Pi again
+
+            Pi = @inferred(plan_ichebyshevtransform!(g))
+            @test @inferred(Pi*g) ≈ fcopy
+            @test g ≈ fcopy  # in-place inverse overwrites g with the samples
+            g .= gcopy  # restore the coefficients before the next in-place plan
+            @test_throws ArgumentError Pi * T[1,2]
+            Pi2 = @inferred(plan_ichebyshevtransform!(g, 1:1))
+            @test @inferred(Pi2*g) ≈ fcopy
+            @test g ≈ fcopy
+            @test_throws ArgumentError Pi2 * T[1,2]  # check the dims-restricted plan itself, not Pi again
+
+            v = T[1]  # length-1 input: every transform returns it unchanged
+            @test chebyshevtransform(v) == v
+            @test ichebyshevtransform(v) == v
+            @test chebyshevtransform!(v) === v
+            @test ichebyshevtransform!(v) === v
+
+            v = T[]  # empty input: every transform returns it unchanged
+            @test chebyshevtransform(v) == v
+            @test ichebyshevtransform(v) == v
+            @test chebyshevtransform!(v) === v
+            @test ichebyshevtransform!(v) === v
         end
     end
     @testset "Chebyshev second kind points <-> first kind coefficients" begin
         for T in (Float32, Float64, ComplexF32, ComplexF64)
             n = 20
-            p_2 = chebyshevpoints(T, n, kind=2)
+            p_2 = chebyshevpoints(T, n, Val(2))
             f = exp.(p_2)
-            f̌ = chebyshevtransform(f; kind=2)
+            g = @inferred(chebyshevtransform(f, Val(2)))
+            @test g == chebyshevtransform!(copy(f), Val(2))
 
-            f̃ = x -> [cos(k*acos(x)) for k=0:n-1]' * f̌
+            f̃ = x -> [cos(k*acos(x)) for k=0:n-1]' * g
             @test f̃(0.1) ≈ exp(T(0.1))
-            @test ichebyshevtransform(f̌; kind=2) ≈ exp.(p_2)
+            @test @inferred(ichebyshevtransform(g, Val(2))) ≈ ichebyshevtransform!(copy(g), Val(2)) ≈ exp.(p_2)
 
-            P = plan_chebyshevtransform!(f; kind=2)
-            Pi = plan_ichebyshevtransform!(f; kind=2)
-            @test all((P \ copy(f)) .=== Pi * copy(f))
-            @test all((Pi \ copy(f̌)) .=== P * copy(f̌))
+            P = @inferred(plan_chebyshevtransform!(f, Val(2)))
+            Pi = @inferred(plan_ichebyshevtransform!(f, Val(2)))
+            @test all(@inferred(P \ copy(f)) .=== Pi * copy(f))
+            @test all(@inferred(Pi \ copy(g)) .=== P * copy(g))
             @test f ≈ P \ (P*copy(f)) ≈ P * (P\copy(f)) ≈ Pi \ (Pi*copy(f)) ≈ Pi * (Pi \ copy(f))
 
-            f̃ = copy(f)
-            f̄ = copy(f̌)
-            P = plan_chebyshevtransform(f; kind=2)
-            @test P*f == f̌
-            @test f == f̃
-            P = plan_chebyshevtransform!(f; kind=2)
-            @test P*f == f̌
-            @test f == f̌
-            Pi = plan_ichebyshevtransform(f̌; kind=2)
-            @test Pi*f̌ ≈ f̃
-            @test f̌ == f̄
-            Pi = plan_ichebyshevtransform!(f̌; kind=2)
-            @test Pi*f̌ ≈ f̃
-            @test f̌ ≈ f̃
-
-            @test chebyshevtransform(T[1]; kind=2) == T[1]
-            @test ichebyshevtransform(T[1]; kind=2) == T[1]
-            @test chebyshevtransform(T[]; kind=2) == T[]
-            @test ichebyshevtransform(T[]; kind=2) == T[]
+            fcopy = copy(f)
+            gcopy = copy(g)
+
+            P = @inferred(plan_chebyshevtransform(f, Val(2)))
+            @test P*f == g
+            @test f == fcopy
+            @test_throws ArgumentError P * T[1,2]
+            P = @inferred(plan_chebyshevtransform(f, Val(2), 1:1))
+            @test P*f == g
+            @test f == fcopy
+            @test_throws ArgumentError P * T[1,2]
+
+            P = @inferred(plan_chebyshevtransform!(f, Val(2)))
+            @test P*f == g
+            @test f == g
+            @test_throws ArgumentError P * T[1,2]
+            f .= fcopy
+            P = @inferred(plan_chebyshevtransform!(f, Val(2), 1:1))
+            @test P*f == g
+            @test f == g
+            @test_throws ArgumentError P * T[1,2]
+
+            Pi = @inferred(plan_ichebyshevtransform(g, Val(2)))
+            @test Pi*g ≈ fcopy
+            @test g == gcopy
+            @test_throws ArgumentError Pi * T[1,2]
+            Pi = @inferred(plan_ichebyshevtransform(g, Val(2), 1:1))
+            @test Pi*g ≈ fcopy
+            @test g == gcopy
+            @test_throws ArgumentError Pi * T[1,2]
+
+            Pi = @inferred(plan_ichebyshevtransform!(g, Val(2)))
+            @test Pi*g ≈ fcopy
+            @test g ≈ fcopy
+            @test_throws ArgumentError Pi * T[1,2]
+            g .= gcopy
+            Pi = @inferred(plan_ichebyshevtransform!(g, Val(2), 1:1))
+            @test Pi*g ≈ fcopy
+            @test g ≈ fcopy
+            @test_throws ArgumentError Pi * T[1,2]
+
+            @test_throws ArgumentError chebyshevtransform(T[1], Val(2))
+            @test_throws ArgumentError ichebyshevtransform(T[1], Val(2))
+            @test_throws ArgumentError chebyshevtransform(T[], Val(2))
+            @test_throws ArgumentError ichebyshevtransform(T[], Val(2))
         end
     end
 
     @testset "Chebyshev first kind points <-> second kind coefficients" begin
         for T in (Float32, Float64, ComplexF32, ComplexF64)
             n = 20
-            p_1 = chebyshevpoints(T, n, kind=1)
+            p_1 = chebyshevpoints(T, n)
             f = exp.(p_1)
-            f̌ = chebyshevutransform(f; kind=1)
+            g = @inferred(chebyshevutransform(f))
+            @test f ≈ exp.(p_1)  # the out-of-place transform must not have modified f
 
-            f̃ = x -> [sin((k+1)*acos(x))/sin(acos(x)) for k=0:n-1]' * f̌
+            f̃ = x -> [sin((k+1)*acos(x))/sin(acos(x)) for k=0:n-1]' * g
             @test f̃(0.1) ≈ exp(T(0.1))
-            @test ichebyshevutransform(f̌; kind=1) ≈ exp.(p_1)
-
-            f̃ = copy(f)
-            f̄ = copy(f̌)
-            P = plan_chebyshevutransform(f; kind=1)
-            @test P*f == f̌
-            @test f == f̃
-            P = plan_chebyshevutransform!(f; kind=1)
-            @test P*f == f̌
-            @test f == f̌
-            Pi = plan_ichebyshevutransform(f̌; kind=1)
-            @test Pi*f̌ ≈ f̃
-            @test f̌ == f̄
-            Pi = plan_ichebyshevutransform!(f̌; kind=1)
-            @test Pi*f̌ ≈ f̃
-            @test f̌ ≈ f̃
-
-            @test chebyshevutransform(T[1]; kind=1) == T[1]
-            @test ichebyshevutransform(T[1]; kind=1) == T[1]
-            @test chebyshevutransform(T[]; kind=1) == T[]
-            @test ichebyshevutransform(T[]; kind=1) == T[]
+            @test ichebyshevutransform(g) ≈ exp.(p_1)
+
+            fcopy = copy(f)  # pristine samples, used to detect mutation and to restore f
+            gcopy = copy(g)  # pristine coefficients, used to detect mutation and to restore g
+            P = @inferred(plan_chebyshevutransform(f))
+            @test P*f ≈ g
+            @test f ≈ fcopy  # out-of-place plan must leave its input untouched
+            @test_throws ArgumentError P * T[1,2]
+            P = @inferred(plan_chebyshevutransform(f, 1:1))  # same transform with explicit dims
+            @test P*f ≈ g
+            @test f ≈ fcopy
+            @test_throws ArgumentError P * T[1,2]
+
+            P = @inferred(plan_chebyshevutransform!(f))
+            @test P*f ≈ g
+            @test f ≈ g  # in-place plan overwrites f with the coefficients
+            @test_throws ArgumentError P * T[1,2]
+            f .= fcopy  # restore the samples before the next in-place plan
+            P = @inferred(plan_chebyshevutransform!(f))  # NOTE(review): same call as the plan above; sibling testsets pass dims (1:1) here — confirm intent
+            @test P*f ≈ g
+            @test f ≈ g
+            @test_throws ArgumentError P * T[1,2]
+
+            Pi = @inferred(plan_ichebyshevutransform(g))
+            @test Pi*g ≈ fcopy
+            @test g == gcopy  # out-of-place inverse must leave its input untouched
+            @test_throws ArgumentError Pi * T[1,2]
+            Pi = @inferred(plan_ichebyshevutransform(g, 1:1))  # same inverse with explicit dims
+            @test Pi*g ≈ fcopy
+            @test g == gcopy
+            @test_throws ArgumentError Pi * T[1,2]
+
+            Pi = @inferred(plan_ichebyshevutransform!(g))
+            @test Pi*g ≈ fcopy
+            @test g ≈ fcopy  # in-place inverse overwrites g with the samples
+            @test_throws ArgumentError Pi * T[1,2]
+            g .= gcopy  # restore the coefficients before the next in-place plan
+            Pi = @inferred(plan_ichebyshevutransform!(g))  # NOTE(review): same call as the plan above; sibling testsets pass dims (1:1) here — confirm intent
+            @test Pi*g ≈ fcopy
+            @test g ≈ fcopy
+            @test_throws ArgumentError Pi * T[1,2]
+
+            v = T[1]  # length-1 input: every transform returns it unchanged
+            @test chebyshevutransform(v) == v
+            @test ichebyshevutransform(v) == v
+            @test chebyshevutransform!(v) === v
+            @test ichebyshevutransform!(v) === v
+
+            v = T[]  # empty input: every transform returns it unchanged
+            @test chebyshevutransform(v) == v
+            @test ichebyshevutransform(v) == v
+            @test chebyshevutransform!(v) === v
+            @test ichebyshevutransform!(v) === v
         end
     end
-
     @testset "Chebyshev second kind points <-> second kind coefficients" begin
         for T in (Float32, Float64, ComplexF32, ComplexF64)
             n = 20
-            p_2 = chebyshevpoints(T, n, kind=2)[2:end-1]
+            p_2 = chebyshevpoints(T, n, Val(2))[2:end-1]
             f = exp.(p_2)
-            f̌ = chebyshevutransform(f; kind=2)
+            g = @inferred(chebyshevutransform(f, Val(2)))
 
-            f̃ = x -> [sin((k+1)*acos(x))/sin(acos(x)) for k=0:n-3]' * f̌
+            f̃ = x -> [sin((k+1)*acos(x))/sin(acos(x)) for k=0:n-3]' * g
             @test f̃(0.1) ≈ exp(T(0.1))
-            @test ichebyshevutransform(f̌; kind=2) ≈ exp.(p_2)
-
-            f̃ = copy(f)
-            f̄ = copy(f̌)
-            P = plan_chebyshevutransform(f; kind=2)
-            @test P*f == f̌
-            @test f == f̃
-            P = plan_chebyshevutransform!(f; kind=2)
-            @test P*f == f̌
-            @test f == f̌
-            Pi = plan_ichebyshevutransform(f̌; kind=2)
-            @test Pi*f̌ ≈ f̃
-            @test f̌ == f̄
-            Pi = plan_ichebyshevutransform!(f̌; kind=2)
-            @test Pi*f̌ ≈ f̃
-            @test f̌ ≈ f̃
-
-            @test chebyshevutransform(T[1]; kind=2) == T[1]
-            @test ichebyshevutransform(T[1]; kind=2) == T[1]
-            @test chebyshevutransform(T[]; kind=2) == T[]
-            @test ichebyshevutransform(T[]; kind=2) == T[]
+            @test @inferred(ichebyshevutransform(g, Val(2))) ≈ f ≈ exp.(p_2)  # also checks that f was not modified
+
+            fcopy = copy(f)  # pristine samples, used to detect mutation and to restore f
+            gcopy = copy(g)  # pristine coefficients, used to detect mutation and to restore g
+            P = @inferred(plan_chebyshevutransform(f, Val(2)))
+            @test @inferred(P*f) ≈ g
+            @test f ≈ fcopy  # out-of-place plan must leave its input untouched
+            @test_throws ArgumentError P * T[1,2]
+            P = @inferred(plan_chebyshevutransform(f, Val(2), 1:1))  # same transform with explicit dims
+            @test @inferred(P*f) ≈ g
+            @test f ≈ fcopy
+            @test_throws ArgumentError P * T[1,2]
+
+            P = @inferred(plan_chebyshevutransform!(f, Val(2)))
+            @test @inferred(P*f) ≈ g
+            @test f ≈ g  # in-place plan overwrites f with the coefficients
+            @test_throws ArgumentError P * T[1,2]
+            f .= fcopy  # restore the samples before the next in-place plan
+            P = @inferred(plan_chebyshevutransform!(f, Val(2), 1:1))
+            @test @inferred(P*f) ≈ g
+            @test f ≈ g
+            @test_throws ArgumentError P * T[1,2]
+
+            Pi = @inferred(plan_ichebyshevutransform(g, Val(2)))
+            @test @inferred(Pi*g) ≈ fcopy
+            @test g ≈ gcopy  # out-of-place inverse must leave its input untouched
+            @test_throws ArgumentError Pi * T[1,2]
+
+            Pi = @inferred(plan_ichebyshevutransform!(g, Val(2)))
+            @test @inferred(Pi*g) ≈ fcopy
+            @test g ≈ fcopy  # in-place inverse overwrites g with the samples
+            @test_throws ArgumentError Pi * T[1,2]
+            g .= gcopy  # restore the coefficients before the next in-place plan
+            Pi = @inferred(plan_ichebyshevutransform!(g, Val(2)))  # NOTE(review): same call as the plan above; the forward in-place test passes dims (1:1) here — confirm intent
+            @test @inferred(Pi*g) ≈ fcopy
+            @test g ≈ fcopy
+            @test_throws ArgumentError Pi * T[1,2]
+
+            @test_throws ArgumentError chebyshevutransform(T[1], Val(2))  # length-1 and empty inputs are rejected for Val(2)
+            @test_throws ArgumentError ichebyshevutransform(T[1], Val(2))
+            @test_throws ArgumentError chebyshevutransform(T[], Val(2))
+            @test_throws ArgumentError ichebyshevutransform(T[], Val(2))
         end
     end
+
+    @testset "matrix" begin
+        X = randn(4,5)
+        @testset "chebyshevtransform" begin
+            @test @inferred(chebyshevtransform(X,1)) ≈ @inferred(chebyshevtransform!(copy(X),1)) ≈ hcat(chebyshevtransform.([X[:,k] for k=axes(X,2)])...)
+            @test chebyshevtransform(X,2) ≈ chebyshevtransform!(copy(X),2) ≈ hcat(chebyshevtransform.([X[k,:] for k=axes(X,1)])...)'
+            @test @inferred(chebyshevtransform(X,Val(2),1)) ≈ @inferred(chebyshevtransform!(copy(X),Val(2),1)) ≈ hcat(chebyshevtransform.([X[:,k] for k=axes(X,2)],Val(2))...)
+            @test chebyshevtransform(X,Val(2),2) ≈ chebyshevtransform!(copy(X),Val(2),2) ≈ hcat(chebyshevtransform.([X[k,:] for k=axes(X,1)],Val(2))...)'
+
+            @test @inferred(chebyshevtransform(X)) ≈ @inferred(chebyshevtransform!(copy(X))) ≈ chebyshevtransform(chebyshevtransform(X,1),2)
+            @test @inferred(chebyshevtransform(X,Val(2))) ≈ @inferred(chebyshevtransform!(copy(X),Val(2))) ≈ chebyshevtransform(chebyshevtransform(X,Val(2),1),Val(2),2)
+        end
+
+        @testset "ichebyshevtransform" begin
+            @test @inferred(ichebyshevtransform(X,1)) ≈ @inferred(ichebyshevtransform!(copy(X),1)) ≈ hcat(ichebyshevtransform.([X[:,k] for k=axes(X,2)])...)
+            @test ichebyshevtransform(X,2) ≈ ichebyshevtransform!(copy(X),2) ≈ hcat(ichebyshevtransform.([X[k,:] for k=axes(X,1)])...)'
+            @test @inferred(ichebyshevtransform(X,Val(2),1)) ≈ @inferred(ichebyshevtransform!(copy(X),Val(2),1)) ≈ hcat(ichebyshevtransform.([X[:,k] for k=axes(X,2)],Val(2))...)
+            @test ichebyshevtransform(X,Val(2),2) ≈ ichebyshevtransform!(copy(X),Val(2),2) ≈ hcat(ichebyshevtransform.([X[k,:] for k=axes(X,1)],Val(2))...)'
+
+            @test @inferred(ichebyshevtransform(X)) ≈ @inferred(ichebyshevtransform!(copy(X))) ≈ ichebyshevtransform(ichebyshevtransform(X,1),2)
+            @test @inferred(ichebyshevtransform(X,Val(2))) ≈ @inferred(ichebyshevtransform!(copy(X),Val(2))) ≈ ichebyshevtransform(ichebyshevtransform(X,Val(2),1),Val(2),2)
+
+            @test ichebyshevtransform(chebyshevtransform(X)) ≈ X
+            @test chebyshevtransform(ichebyshevtransform(X)) ≈ X
+        end
+
+        @testset "chebyshevutransform" begin
+            @test @inferred(chebyshevutransform(X,1)) ≈ @inferred(chebyshevutransform!(copy(X),1)) ≈ hcat(chebyshevutransform.([X[:,k] for k=axes(X,2)])...)
+            @test chebyshevutransform(X,2) ≈ chebyshevutransform!(copy(X),2) ≈ hcat(chebyshevutransform.([X[k,:] for k=axes(X,1)])...)'
+            @test @inferred(chebyshevutransform(X,Val(2),1)) ≈ @inferred(chebyshevutransform!(copy(X),Val(2),1)) ≈ hcat(chebyshevutransform.([X[:,k] for k=axes(X,2)],Val(2))...)
+            @test chebyshevutransform(X,Val(2),2) ≈ chebyshevutransform!(copy(X),Val(2),2) ≈ hcat(chebyshevutransform.([X[k,:] for k=axes(X,1)],Val(2))...)'
+
+            @test @inferred(chebyshevutransform(X)) ≈ @inferred(chebyshevutransform!(copy(X))) ≈ chebyshevutransform(chebyshevutransform(X,1),2)
+            @test @inferred(chebyshevutransform(X,Val(2))) ≈ @inferred(chebyshevutransform!(copy(X),Val(2))) ≈ chebyshevutransform(chebyshevutransform(X,Val(2),1),Val(2),2)
+        end
+
+        @testset "ichebyshevutransform" begin
+            @test @inferred(ichebyshevutransform(X,1)) ≈ @inferred(ichebyshevutransform!(copy(X),1)) ≈ hcat(ichebyshevutransform.([X[:,k] for k=axes(X,2)])...)
+            @test ichebyshevutransform(X,2) ≈ ichebyshevutransform!(copy(X),2) ≈ hcat(ichebyshevutransform.([X[k,:] for k=axes(X,1)])...)'
+            @test @inferred(ichebyshevutransform(X,Val(2),1)) ≈ @inferred(ichebyshevutransform!(copy(X),Val(2),1)) ≈ hcat(ichebyshevutransform.([X[:,k] for k=axes(X,2)],Val(2))...)
+            @test ichebyshevutransform(X,Val(2),2) ≈ ichebyshevutransform!(copy(X),Val(2),2) ≈ hcat(ichebyshevutransform.([X[k,:] for k=axes(X,1)],Val(2))...)'
+
+            @test @inferred(ichebyshevutransform(X)) ≈ @inferred(ichebyshevutransform!(copy(X))) ≈ ichebyshevutransform(ichebyshevutransform(X,1),2)
+            @test @inferred(ichebyshevutransform(X,Val(2))) ≈ @inferred(ichebyshevutransform!(copy(X),Val(2))) ≈ ichebyshevutransform(ichebyshevutransform(X,Val(2),1),Val(2),2)
+
+            @test ichebyshevutransform(chebyshevutransform(X)) ≈ X
+            @test chebyshevutransform(ichebyshevutransform(X)) ≈ X
+        end
+
+        X = randn(1,1)
+        @test chebyshevtransform!(copy(X), Val(1)) == ichebyshevtransform!(copy(X), Val(1)) == X
+        @test_throws ArgumentError chebyshevtransform!(copy(X), Val(2))
+        @test_throws ArgumentError ichebyshevtransform!(copy(X), Val(2))
+    end
+
+    @testset "tensor" begin
+        @testset "3D" begin
+            X = randn(4,5,6)
+            X̃ = similar(X)
+            @testset "chebyshevtransform" begin
+                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = chebyshevtransform(X[:,k,j]) end
+                @test @inferred(chebyshevtransform(X,1)) ≈ @inferred(chebyshevtransform!(copy(X),1)) ≈ X̃
+                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = chebyshevtransform(X[k,:,j]) end
+                @test chebyshevtransform(X,2) ≈ chebyshevtransform!(copy(X),2) ≈ X̃
+                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = chebyshevtransform(X[k,j,:]) end
+                @test chebyshevtransform(X,3) ≈ chebyshevtransform!(copy(X),3) ≈ X̃
+
+                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = chebyshevtransform(X[:,k,j],Val(2)) end
+                @test @inferred(chebyshevtransform(X,Val(2),1)) ≈ @inferred(chebyshevtransform!(copy(X),Val(2),1)) ≈ X̃
+                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = chebyshevtransform(X[k,:,j],Val(2)) end
+                @test chebyshevtransform(X,Val(2),2) ≈ chebyshevtransform!(copy(X),Val(2),2) ≈ X̃
+                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = chebyshevtransform(X[k,j,:],Val(2)) end
+                @test chebyshevtransform(X,Val(2),3) ≈ chebyshevtransform!(copy(X),Val(2),3) ≈ X̃
+
+                @test @inferred(chebyshevtransform(X)) ≈ @inferred(chebyshevtransform!(copy(X))) ≈ chebyshevtransform(chebyshevtransform(chebyshevtransform(X,1),2),3)
+                @test @inferred(chebyshevtransform(X,Val(2))) ≈ @inferred(chebyshevtransform!(copy(X),Val(2))) ≈ chebyshevtransform(chebyshevtransform(chebyshevtransform(X,Val(2),1),Val(2),2),Val(2),3)
+            end
+
+            @testset "ichebyshevtransform" begin
+                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = ichebyshevtransform(X[:,k,j]) end
+                @test @inferred(ichebyshevtransform(X,1)) ≈ @inferred(ichebyshevtransform!(copy(X),1)) ≈ X̃
+                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = ichebyshevtransform(X[k,:,j]) end
+                @test ichebyshevtransform(X,2) ≈ ichebyshevtransform!(copy(X),2) ≈ X̃
+                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = ichebyshevtransform(X[k,j,:]) end
+                @test ichebyshevtransform(X,3) ≈ ichebyshevtransform!(copy(X),3) ≈ X̃
+
+                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = ichebyshevtransform(X[:,k,j],Val(2)) end
+                @test @inferred(ichebyshevtransform(X,Val(2),1)) ≈ @inferred(ichebyshevtransform!(copy(X),Val(2),1)) ≈ X̃
+                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = ichebyshevtransform(X[k,:,j],Val(2)) end
+                @test ichebyshevtransform(X,Val(2),2) ≈ ichebyshevtransform!(copy(X),Val(2),2) ≈ X̃
+                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = ichebyshevtransform(X[k,j,:],Val(2)) end
+                @test ichebyshevtransform(X,Val(2),3) ≈ ichebyshevtransform!(copy(X),Val(2),3) ≈ X̃
+
+                @test @inferred(ichebyshevtransform(X)) ≈ @inferred(ichebyshevtransform!(copy(X))) ≈ ichebyshevtransform(ichebyshevtransform(ichebyshevtransform(X,1),2),3)
+                @test @inferred(ichebyshevtransform(X,Val(2))) ≈ @inferred(ichebyshevtransform!(copy(X),Val(2))) ≈ ichebyshevtransform(ichebyshevtransform(ichebyshevtransform(X,Val(2),1),Val(2),2),Val(2),3)
+
+                @test ichebyshevtransform(chebyshevtransform(X)) ≈ X
+                @test chebyshevtransform(ichebyshevtransform(X)) ≈ X
+            end
+        
+            @testset "chebyshevutransform" begin
+                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = chebyshevutransform(X[:,k,j]) end
+                @test @inferred(chebyshevutransform(X,1)) ≈ @inferred(chebyshevutransform!(copy(X),1)) ≈ X̃
+                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = chebyshevutransform(X[k,:,j]) end
+                @test chebyshevutransform(X,2) ≈ chebyshevutransform!(copy(X),2) ≈ X̃
+                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = chebyshevutransform(X[k,j,:]) end
+                @test chebyshevutransform(X,3) ≈ chebyshevutransform!(copy(X),3) ≈ X̃
+
+                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = chebyshevutransform(X[:,k,j],Val(2)) end
+                @test @inferred(chebyshevutransform(X,Val(2),1)) ≈ @inferred(chebyshevutransform!(copy(X),Val(2),1)) ≈ X̃
+                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = chebyshevutransform(X[k,:,j],Val(2)) end
+                @test chebyshevutransform(X,Val(2),2) ≈ chebyshevutransform!(copy(X),Val(2),2) ≈ X̃
+                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = chebyshevutransform(X[k,j,:],Val(2)) end
+                @test chebyshevutransform(X,Val(2),3) ≈ chebyshevutransform!(copy(X),Val(2),3) ≈ X̃
+
+                @test @inferred(chebyshevutransform(X)) ≈ @inferred(chebyshevutransform!(copy(X))) ≈ chebyshevutransform(chebyshevutransform(chebyshevutransform(X,1),2),3)
+                @test @inferred(chebyshevutransform(X,Val(2))) ≈ @inferred(chebyshevutransform!(copy(X),Val(2))) ≈ chebyshevutransform(chebyshevutransform(chebyshevutransform(X,Val(2),1),Val(2),2),Val(2),3)
+            end
+
+            @testset "ichebyshevutransform" begin
+                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = ichebyshevutransform(X[:,k,j]) end
+                @test @inferred(ichebyshevutransform(X,1)) ≈ @inferred(ichebyshevutransform!(copy(X),1)) ≈ X̃
+                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = ichebyshevutransform(X[k,:,j]) end
+                @test ichebyshevutransform(X,2) ≈ ichebyshevutransform!(copy(X),2) ≈ X̃
+                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = ichebyshevutransform(X[k,j,:]) end
+                @test ichebyshevutransform(X,3) ≈ ichebyshevutransform!(copy(X),3) ≈ X̃
+
+                for k = axes(X,2), j = axes(X,3) X̃[:,k,j] = ichebyshevutransform(X[:,k,j],Val(2)) end
+                @test @inferred(ichebyshevutransform(X,Val(2),1)) ≈ @inferred(ichebyshevutransform!(copy(X),Val(2),1)) ≈ X̃
+                for k = axes(X,1), j = axes(X,3) X̃[k,:,j] = ichebyshevutransform(X[k,:,j],Val(2)) end
+                @test ichebyshevutransform(X,Val(2),2) ≈ ichebyshevutransform!(copy(X),Val(2),2) ≈ X̃
+                for k = axes(X,1), j = axes(X,2) X̃[k,j,:] = ichebyshevutransform(X[k,j,:],Val(2)) end
+                @test ichebyshevutransform(X,Val(2),3) ≈ ichebyshevutransform!(copy(X),Val(2),3) ≈ X̃
+
+                @test @inferred(ichebyshevutransform(X)) ≈ @inferred(ichebyshevutransform!(copy(X))) ≈ ichebyshevutransform(ichebyshevutransform(ichebyshevutransform(X,1),2),3)
+                @test @inferred(ichebyshevutransform(X,Val(2))) ≈ @inferred(ichebyshevutransform!(copy(X),Val(2))) ≈ ichebyshevutransform(ichebyshevutransform(ichebyshevutransform(X,Val(2),1),Val(2),2),Val(2),3)
+
+                @test ichebyshevutransform(chebyshevutransform(X)) ≈ X
+                @test chebyshevutransform(ichebyshevutransform(X)) ≈ X
+            end
+
+            X = randn(1,1,1)
+            @test chebyshevtransform!(copy(X), Val(1)) == ichebyshevtransform!(copy(X), Val(1)) == X
+            @test_throws ArgumentError chebyshevtransform!(copy(X), Val(2))
+            @test_throws ArgumentError ichebyshevtransform!(copy(X), Val(2))
+        end
+
+        @testset "4D" begin
+            X = randn(2,3,4,5)
+            X̃ = similar(X)
+            for trans in (chebyshevtransform, ichebyshevtransform, chebyshevutransform, ichebyshevutransform)
+                for k = axes(X,2), j = axes(X,3), l = axes(X,4) X̃[:,k,j,l] = trans(X[:,k,j,l]) end
+                @test @inferred(trans(X,1)) ≈ X̃
+                @test @inferred(trans(X)) ≈ trans(trans(trans(trans(X,1),2),3),4)
+            end
+        end
+    end
+    @testset "Integer" begin
+        @test chebyshevtransform([1,2,3]) == chebyshevtransform([1.,2,3])
+        @test chebyshevtransform([1,2,3], Val(2)) == chebyshevtransform([1.,2,3], Val(2))
+        @test ichebyshevtransform([1,2,3]) == ichebyshevtransform([1.,2,3])
+        @test ichebyshevtransform([1,2,3], Val(2)) == ichebyshevtransform([1.,2,3], Val(2))
+
+        @test chebyshevutransform([1,2,3]) == chebyshevutransform([1.,2,3])
+        @test chebyshevutransform([1,2,3], Val(2)) == chebyshevutransform([1.,2,3], Val(2))
+        @test ichebyshevutransform([1,2,3]) == ichebyshevutransform([1.,2,3])
+        @test ichebyshevutransform([1,2,3], Val(2)) == ichebyshevutransform([1.,2,3], Val(2))
+    end
+
+    @testset "BigFloat" begin
+        x = BigFloat[1,2,3]
+        @test ichebyshevtransform(chebyshevtransform(x)) ≈ x
+        @test plan_chebyshevtransform(x)x ≈ chebyshevtransform(x)
+        @test plan_ichebyshevtransform(x)x ≈ ichebyshevtransform(x)
+        @test plan_chebyshevtransform!(x)copy(x) ≈ chebyshevtransform(x)
+        @test plan_ichebyshevtransform!(x)copy(x) ≈ ichebyshevtransform(x)
+    end
+    @testset "BigInt" begin
+        x = big(10)^400 .+ BigInt[1,2,3]
+        @test ichebyshevtransform(chebyshevtransform(x)) ≈ x
+    end
+
+    @testset "immutable vectors" begin
+        F = plan_chebyshevtransform([1.,2,3])
+        @test chebyshevtransform(1.0:3) == F * (1:3)
+        @test ichebyshevtransform(1.0:3) == ichebyshevtransform([1.0:3;])
+    end
+
+    @testset "inv" begin  # every plan F must satisfy F \ (F*x) ≈ F * (F\x) ≈ x
+        x = randn(5)
+        for F in (plan_chebyshevtransform(x), plan_chebyshevtransform(x, Val(2)),
+                  plan_chebyshevutransform(x), plan_chebyshevutransform(x, Val(2)),
+                  plan_ichebyshevtransform(x), plan_ichebyshevtransform(x, Val(2)),
+                  plan_ichebyshevutransform(x), plan_ichebyshevutransform(x, Val(2)))
+            @test F \ (F*x) ≈ F * (F\x) ≈ x
+        end
+
+        X = randn(5,4)  # matrix input: plans are built along dimension 1 and dimension 2 separately
+        for F in (plan_chebyshevtransform(X,Val(1),1), plan_chebyshevtransform(X, Val(2),1),
+            plan_chebyshevtransform(X,Val(1),2), plan_chebyshevtransform(X, Val(2),2),
+            plan_ichebyshevtransform(X,Val(1),1), plan_ichebyshevtransform(X, Val(2),1),
+            plan_ichebyshevtransform(X,Val(1),2), plan_ichebyshevtransform(X, Val(2),2))
+            @test F \ (F*X) ≈ F * (F\X) ≈ X
+        end
+        # Same round-trip check for the matrix chebyshevu plans along each dimension
+        for F in (plan_chebyshevutransform(X,Val(1),1), plan_chebyshevutransform(X, Val(2),1),
+            plan_chebyshevutransform(X,Val(1),2), plan_chebyshevutransform(X, Val(2),2),
+            plan_ichebyshevutransform(X,Val(1),1), plan_ichebyshevutransform(X, Val(2),1),
+            plan_ichebyshevutransform(X,Val(1),2), plan_ichebyshevutransform(X, Val(2),2))
+            @test F \ (F*X) ≈ F * (F\X) ≈ X
+        end
+    end
+
+    @testset "incompatible shapes" begin
+        @test_throws ErrorException plan_chebyshevtransform(randn(5)) * randn(5,5)
+        @test_throws ErrorException plan_ichebyshevtransform(randn(5)) * randn(5,5)
+    end
+
+    @testset "plan via size" begin
+        X = randn(3,4)
+        p = plan_chebyshevtransform(Float64, (3,4))
+        @test p * X == chebyshevtransform(X)
+    end
 end
diff --git a/test/fftBigFloattests.jl b/test/fftBigFloattests.jl
deleted file mode 100644
index d1f5a1fe..00000000
--- a/test/fftBigFloattests.jl
+++ /dev/null
@@ -1,109 +0,0 @@
-using FastTransforms, FFTW, Test
-
-@testset "BigFloat FFT and DCT" begin
-
-    c = collect(range(-big(1.0),stop=1,length=16))
-    @test norm(fft(c) - fft(Float64.(c))) < 3Float64(norm(c))*eps()
-    @test norm(ifft(c) - ifft(Float64.(c))) < 3Float64(norm(c))*eps()
-
-    c = collect(range(-big(1.0),stop=1.0,length=201))
-    @test norm(ifft(fft(c))-c) < 200norm(c)eps(BigFloat)
-
-    p = plan_dct(c)
-    @test norm(FastTransforms.generic_dct(c) - p*c) == 0
-
-    pi = plan_idct!(c)
-    @test norm(pi*dct(c) - c) < 1000norm(c)*eps(BigFloat)
-
-    @test norm(dct(c)-dct(map(Float64,c)),Inf) < 10eps()
-
-    cc = cis.(c)
-    @test norm(dct(cc)-dct(map(Complex{Float64},cc)),Inf) < 10eps()
-
-    c = big.(rand(100)) + im*big.(rand(100))
-    @test norm(dct(c)-dct(map(ComplexF64,c)),Inf) < 10eps()
-    @test norm(idct(c)-idct(map(ComplexF64,c)),Inf) < 10eps()
-    @test norm(idct(dct(c))-c,Inf) < 1000eps(BigFloat)
-    @test norm(dct(idct(c))-c,Inf) < 1000eps(BigFloat)
-
-    c = randn(ComplexF16, 20)
-    p = plan_fft(c)
-    @test inv(p) * (p * c) ≈ c
-
-    c = randn(ComplexF16, 20)
-    pinpl = plan_fft!(c)
-    @test inv(pinpl) * (pinpl * c) ≈ c
-
-    # Make sure we don't accidentally hijack any FFTW plans
-    for T in (Float32, Float64)
-        @test plan_fft(rand(BigFloat,10)) isa FastTransforms.DummyPlan
-        @test plan_fft(rand(BigFloat,10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_fft(rand(Complex{BigFloat},10)) isa FastTransforms.DummyPlan
-        @test plan_fft(rand(Complex{BigFloat},10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_fft!(rand(Complex{BigFloat},10)) isa FastTransforms.DummyPlan
-        @test plan_fft!(rand(Complex{BigFloat},10), 1:1) isa FastTransforms.DummyPlan
-        @test !( plan_fft(rand(T,10)) isa FastTransforms.DummyPlan )
-        @test !( plan_fft(rand(T,10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_fft(rand(Complex{T},10)) isa FastTransforms.DummyPlan )
-        @test !( plan_fft(rand(Complex{T},10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_fft!(rand(Complex{T},10)) isa FastTransforms.DummyPlan )
-        @test !( plan_fft!(rand(Complex{T},10), 1:1) isa FastTransforms.DummyPlan )
-
-        @test plan_ifft(rand(T,10)) isa FFTW.ScaledPlan
-        @test plan_ifft(rand(T,10), 1:1) isa FFTW.ScaledPlan
-        @test plan_ifft(rand(Complex{T},10)) isa FFTW.ScaledPlan
-        @test plan_ifft(rand(Complex{T},10), 1:1) isa FFTW.ScaledPlan
-        @test plan_ifft!(rand(Complex{T},10)) isa FFTW.ScaledPlan
-        @test plan_ifft!(rand(Complex{T},10), 1:1) isa FFTW.ScaledPlan
-
-        @test plan_bfft(rand(BigFloat,10)) isa FastTransforms.DummyPlan
-        @test plan_bfft(rand(BigFloat,10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_bfft(rand(Complex{BigFloat},10)) isa FastTransforms.DummyPlan
-        @test plan_bfft(rand(Complex{BigFloat},10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_bfft!(rand(Complex{BigFloat},10)) isa FastTransforms.DummyPlan
-        @test plan_bfft!(rand(Complex{BigFloat},10), 1:1) isa FastTransforms.DummyPlan
-        @test !( plan_bfft(rand(T,10)) isa FastTransforms.DummyPlan )
-        @test !( plan_bfft(rand(T,10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_bfft(rand(Complex{T},10)) isa FastTransforms.DummyPlan )
-        @test !( plan_bfft(rand(Complex{T},10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_bfft!(rand(Complex{T},10)) isa FastTransforms.DummyPlan )
-        @test !( plan_bfft!(rand(Complex{T},10), 1:1) isa FastTransforms.DummyPlan )
-
-        @test plan_dct(rand(BigFloat,10)) isa FastTransforms.DummyPlan
-        @test plan_dct(rand(BigFloat,10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_dct(rand(Complex{BigFloat},10)) isa FastTransforms.DummyPlan
-        @test plan_dct(rand(Complex{BigFloat},10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_dct!(rand(Complex{BigFloat},10)) isa FastTransforms.DummyPlan
-        @test plan_dct!(rand(Complex{BigFloat},10), 1:1) isa FastTransforms.DummyPlan
-        @test !( plan_dct(rand(T,10)) isa FastTransforms.DummyPlan )
-        @test !( plan_dct(rand(T,10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_dct(rand(Complex{T},10)) isa FastTransforms.DummyPlan )
-        @test !( plan_dct(rand(Complex{T},10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_dct!(rand(Complex{T},10)) isa FastTransforms.DummyPlan )
-        @test !( plan_dct!(rand(Complex{T},10), 1:1) isa FastTransforms.DummyPlan )
-
-        @test plan_idct(rand(BigFloat,10)) isa FastTransforms.DummyPlan
-        @test plan_idct(rand(BigFloat,10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_idct(rand(Complex{BigFloat},10)) isa FastTransforms.DummyPlan
-        @test plan_idct(rand(Complex{BigFloat},10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_idct!(rand(Complex{BigFloat},10)) isa FastTransforms.DummyPlan
-        @test plan_idct!(rand(Complex{BigFloat},10), 1:1) isa FastTransforms.DummyPlan
-        @test !( plan_idct(rand(T,10)) isa FastTransforms.DummyPlan )
-        @test !( plan_idct(rand(T,10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_idct(rand(Complex{T},10)) isa FastTransforms.DummyPlan )
-        @test !( plan_idct(rand(Complex{T},10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_idct!(rand(Complex{T},10)) isa FastTransforms.DummyPlan )
-        @test !( plan_idct!(rand(Complex{T},10), 1:1) isa FastTransforms.DummyPlan )
-
-        @test plan_rfft(rand(BigFloat,10)) isa FastTransforms.DummyPlan
-        @test plan_rfft(rand(BigFloat,10), 1:1) isa FastTransforms.DummyPlan
-        @test plan_brfft(rand(Complex{BigFloat},10), 19) isa FastTransforms.DummyPlan
-        @test plan_brfft(rand(Complex{BigFloat},10), 19, 1:1) isa FastTransforms.DummyPlan
-        @test !( plan_rfft(rand(T,10)) isa FastTransforms.DummyPlan )
-        @test !( plan_rfft(rand(T,10), 1:1) isa FastTransforms.DummyPlan )
-        @test !( plan_brfft(rand(Complex{T},10), 19) isa FastTransforms.DummyPlan )
-        @test !( plan_brfft(rand(Complex{T},10), 19, 1:1) isa FastTransforms.DummyPlan )
-
-    end
-
-end
diff --git a/test/gaunttests.jl b/test/gaunttests.jl
index d1f4699b..5f194eff 100644
--- a/test/gaunttests.jl
+++ b/test/gaunttests.jl
@@ -1,4 +1,4 @@
-using FastTransforms, Test
+using FastTransforms, LinearAlgebra, Test
 
 import FastTransforms: δ
 
diff --git a/test/grammatrixtests.jl b/test/grammatrixtests.jl
new file mode 100644
index 00000000..93b62039
--- /dev/null
+++ b/test/grammatrixtests.jl
@@ -0,0 +1,101 @@
+using FastTransforms, BandedMatrices, LazyArrays, LinearAlgebra, Test
+
+@testset "GramMatrix" begin
+    n = 128
+    for T in (Float32, Float64, BigFloat)
+        R = plan_leg2cheb(T, n; normcheb=true)*I
+        X = Tridiagonal([T(n)/(2n-1) for n in 1:n-1], zeros(T, n), [T(n)/(2n+1) for n in 1:n-1]) # Legendre X
+        W = GramMatrix(Symmetric(R'R), X)
+        @test issymmetric(W)
+        @test isposdef(W)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ Symmetric(R'R)
+        @test F.U ≈ R
+
+        R = plan_leg2cheb(T, n; normcheb=true, normleg=true)*I
+        X = SymTridiagonal(zeros(T, n), [sqrt(T(n)^2/(4*n^2-1)) for n in 1:n-1]) # normalized Legendre X
+        W = GramMatrix(Symmetric(R'R), X)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ Symmetric(R'R)
+        @test F.U ≈ R
+
+        b = 4
+        X = BandedMatrix(SymTridiagonal(zeros(T, n+b), [sqrt(T(n)^2/(4*n^2-1)) for n in 1:n+b-1])) # normalized Legendre X
+        M = Symmetric((I+X^2+X^4)[1:n, 1:n])
+        X = BandedMatrix(SymTridiagonal(zeros(T, n), [sqrt(T(n)^2/(4*n^2-1)) for n in 1:n-1])) # normalized Legendre X
+        W = GramMatrix(M, X)
+        @test bandwidths(W) == (b, b)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ M
+
+        X = BandedMatrix(SymTridiagonal(T[2n-1 for n in 1:n+b], T[-n for n in 1:n+b-1])) # Laguerre X, tests nonzero diagonal
+        M = Symmetric((I+X^2+X^4)[1:n, 1:n])
+        X = BandedMatrix(SymTridiagonal(T[2n-1 for n in 1:n], T[-n for n in 1:n-1])) # Laguerre X, tests nonzero diagonal
+        W = GramMatrix(M, X)
+        @test bandwidths(W) == (b, b)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ M
+
+        for μ in (PaddedVector([T(4)/3;0;-T(4)/15], 2n-1), # w(x) = 1-x^2
+                  PaddedVector([T(26)/15;0;-T(4)/105;0;T(16)/315], 2n-1), # w(x) = 1-x^2+x^4
+                  T(1) ./ (1:2n-1)) # Related to a log weight
+            X = Tridiagonal([T(n)/(2n-1) for n in 1:2n-2], zeros(T, 2n-1), [T(n)/(2n+1) for n in 1:2n-2]) # Legendre X
+            W = GramMatrix(μ, X)
+            X = Tridiagonal(X[1:n, 1:n])
+            G = FastTransforms.compute_skew_generators(W)
+            J = T[0 1; -1 0]
+            @test X'W-W*X ≈ G*J*G'
+        end
+    end
+    W = reshape([i for i in 1.0:n^2], n, n)
+    X = reshape([i for i in 1.0:4n^2], 2n, 2n)
+    @test_throws "different sizes" GramMatrix(W, X)
+    X = X[1:n, 1:n]
+    @test_throws "nonsymmetric" GramMatrix(W, X)
+    @test_throws "nontridiagonal" GramMatrix(Symmetric(W), X)
+end
+
+@testset "ChebyshevGramMatrix" begin
+    n = 128
+    for T in (Float32, Float64, BigFloat)
+        μ = FastTransforms.chebyshevmoments1(T, 2n-1)
+        W = ChebyshevGramMatrix(μ)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ W
+        R = plan_cheb2leg(T, n; normleg=true)*I
+        @test F.U ≈ R
+
+        α, β = (T(0.123), T(0.456))
+        μ = FastTransforms.chebyshevjacobimoments1(T, 2n-1, α, β)
+        W = ChebyshevGramMatrix(μ)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ W
+        R = plan_cheb2jac(T, n, α, β; normjac=true)*I
+        @test F.U ≈ R
+
+        μ = FastTransforms.chebyshevlogmoments1(T, 2n-1)
+        W = ChebyshevGramMatrix(μ)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ W
+
+        μ = FastTransforms.chebyshevabsmoments1(T, 2n-1)
+        W = ChebyshevGramMatrix(μ)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ W
+
+        μ = PaddedVector(T(1) ./ [1,2,3,4,5], 2n-1)
+        W = ChebyshevGramMatrix(μ)
+        @test bandwidths(W) == (4, 4)
+        F = cholesky(W)
+        @test F.L*F.L' ≈ W
+        μd = Vector{T}(μ)
+        Wd = ChebyshevGramMatrix(μd)
+        Fd = cholesky(Wd)
+        @test F.L ≈ Fd.L
+
+        X = Tridiagonal([T(1); ones(T, n-2)/2], zeros(T, n), ones(T, n-1)/2)
+        G = FastTransforms.compute_skew_generators(W)
+        J = T[0 1; -1 0]
+        @test 2*(X'W-W*X) ≈ G*J*G'
+    end
+end
diff --git a/test/hermitetests.jl b/test/hermitetests.jl
index c931d190..2e0a7f1a 100644
--- a/test/hermitetests.jl
+++ b/test/hermitetests.jl
@@ -7,7 +7,7 @@ hermitepoints(n) = FastGaussQuadrature.unweightedgausshermite( n )[1]
     @test hermitepoints(100_000)[end] ≈ 446.9720305443094
 
     @test weightedhermitetransform([1.0]) == [1.0]
-    @test weightedhermitetransform(exp.(-hermitepoints(2).^2/2)) == [1.0,0.0]
+    @test weightedhermitetransform(exp.(-hermitepoints(2).^2/2)) ≈ [1.0,0.0]
     @test weightedhermitetransform(exp.(-hermitepoints(3).^2/2)) ≈ [1.0,0.0,0.0]
     @test weightedhermitetransform(exp.(-hermitepoints(1000).^2/2)) ≈ [1.0; zeros(999)]
     @test weightedhermitetransform(exp.(-hermitepoints(3000).^2/2)) ≈ [1.0; zeros(2999)]
diff --git a/test/libfasttransformstests.jl b/test/libfasttransformstests.jl
index 00ca4f4f..545da7d9 100644
--- a/test/libfasttransformstests.jl
+++ b/test/libfasttransformstests.jl
@@ -1,148 +1,236 @@
 using FastTransforms, Test
 
-FastTransforms.set_num_threads(ceil(Int, Base.Sys.CPU_THREADS/2))
+FastTransforms.ft_set_num_threads(ceil(Int, Base.Sys.CPU_THREADS/2))
 
 @testset "libfasttransforms" begin
     n = 64
-    α, β, γ, δ, λ, μ = 0.1, 0.2, 0.3, 0.4, 0.5, 0.6
-    for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
+    for T in (Float32, Float64)
+        c = one(T) ./ (1:n)
+        x = collect(-1 .+ 2*(0:n-1)/T(n))
+        f = similar(x)
+        @test FastTransforms.horner!(f, c, x) == f
+        fd = T[sum(c[k]*x^(k-1) for k in 1:length(c)) for x in x]
+        @test f ≈ fd
+        @test FastTransforms.clenshaw!(f, c, x) == f
+        fd = T[sum(c[k]*cos((k-1)*acos(x)) for k in 1:length(c)) for x in x]
+        @test f ≈ fd
+        A = T[(2k+one(T))/(k+one(T)) for k in 0:length(c)-1]
+        B = T[zero(T) for k in 0:length(c)-1]
+        C = T[k/(k+one(T)) for k in 0:length(c)]
+        phi0 = ones(T, length(x))
+        c = FastTransforms.lib_cheb2leg(c)
+        @test FastTransforms.clenshaw!(f, c, A, B, C, x, phi0) == f
+        @test f ≈ fd
+    end
+
+    α, β, γ, δ, λ, μ, ρ = 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7
+    function test_1d_plans(p1, p2, x)
+        y = p1*x
+        z = p2*y
+        @test z ≈ x
+        y = p1*view(x, :)
+        z = p2*view(y, :)
+        @test z ≈ x
+        y = p1*x
+        z = p1'y
+        y = transpose(p1)*z
+        z = transpose(p1)\y
+        y = p1'\z
+        z = p1\y
+        @test z ≈ x
+        y = p1*view(x, :)
+        z = p1'view(y, :)
+        y = transpose(p1)*view(z, :)
+        z = transpose(p1)\view(y, :)
+        y = p1'\view(z, :)
+        z = p1\view(y, :)
+        @test z ≈ x
+        y = p2*x
+        z = p2'y
+        y = transpose(p2)*z
+        z = transpose(p2)\y
+        y = p2'\z
+        z = p2\y
+        @test z ≈ x
+        y = p2*view(x, :)
+        z = p2'view(y, :)
+        y = transpose(p2)*view(z, :)
+        z = transpose(p2)\view(y, :)
+        y = p2'\view(z, :)
+        z = p2\view(y, :)
+        @test z ≈ x
+        P = p1*I
+        Q = p2*P
+        @test Q ≈ I
+        P = p1*I
+        Q = p1'P
+        P = transpose(p1)*Q
+        Q = transpose(p1)\P
+        P = p1'\Q
+        Q = p1\P
+        @test Q ≈ I
+        P = p2*I
+        Q = p2'P
+        P = transpose(p2)*Q
+        Q = transpose(p2)\P
+        P = p2'\Q
+        Q = p2\P
+        @test Q ≈ I
+    end
+
+    for T in (Float32, Float64, Complex{Float32}, Complex{Float64}, BigFloat, Complex{BigFloat})
         x = T(1)./(1:n)
         Id = Matrix{T}(I, n, n)
         for (p1, p2) in ((plan_leg2cheb(Id), plan_cheb2leg(Id)),
-                          (plan_ultra2ultra(Id, λ, μ), plan_ultra2ultra(Id, μ, λ)),
-                          (plan_jac2jac(Id, α, β, γ, δ), plan_jac2jac(Id, γ, δ, α, β)),
-                          (plan_lag2lag(Id, α, β), plan_lag2lag(Id, β, α)),
-                          (plan_jac2ultra(Id, α, β, λ), plan_ultra2jac(Id, λ, α, β)),
-                          (plan_jac2cheb(Id, α, β), plan_cheb2jac(Id, α, β)),
-                          (plan_ultra2cheb(Id, λ), plan_cheb2ultra(Id, λ)))
-            y = p1*x
-            z = p2*y
-            @test z ≈ x
-            y = p1*x
-            z = p1'y
-            y = transpose(p1)*z
-            z = transpose(p1)\y
-            y = p1'\z
-            z = p1\y
-            @test z ≈ x
-            y = p2*x
-            z = p2'y
-            y = transpose(p2)*z
-            z = transpose(p2)\y
-            y = p2'\z
-            z = p2\y
-            @test z ≈ x
-            P = p1*Id
-            Q = p2*P
-            @test Q ≈ Id
-            P = p1*Id
-            Q = p1'P
-            P = transpose(p1)*Q
-            Q = transpose(p1)\P
-            P = p1'\Q
-            Q = p1\P
-            @test Q ≈ Id
-            P = p2*Id
-            Q = p2'P
-            P = transpose(p2)*Q
-            Q = transpose(p2)\P
-            P = p2'\Q
-            Q = p2\P
-            @test Q ≈ Id
+                         (plan_ultra2ultra(Id, λ, μ), plan_ultra2ultra(Id, μ, λ)),
+                         (plan_jac2jac(Id, α, β, γ, δ), plan_jac2jac(Id, γ, δ, α, β)),
+                         (plan_lag2lag(Id, α, β), plan_lag2lag(Id, β, α)),
+                         (plan_jac2ultra(Id, α, β, λ), plan_ultra2jac(Id, λ, α, β)),
+                         (plan_jac2cheb(Id, α, β), plan_cheb2jac(Id, α, β)),
+                         (plan_ultra2cheb(Id, λ), plan_cheb2ultra(Id, λ)))
+            test_1d_plans(p1, p2, x)
         end
     end
 
-    for T in (BigFloat, Complex{BigFloat})
+    for T in (Float32, Float64, Complex{Float32}, Complex{Float64})
         x = T(1)./(1:n)
         Id = Matrix{T}(I, n, n)
-        for (p1, p2) in ((plan_leg2cheb(Id), plan_cheb2leg(Id)),
-                          (plan_ultra2ultra(Id, λ, μ), plan_ultra2ultra(Id, μ, λ)),
-                          (plan_jac2jac(Id, α, β, γ, δ), plan_jac2jac(Id, γ, δ, α, β)),
-                          (plan_lag2lag(Id, α, β), plan_lag2lag(Id, β, α)),
-                          (plan_jac2ultra(Id, α, β, λ), plan_ultra2jac(Id, λ, α, β)),
-                          (plan_jac2cheb(Id, α, β), plan_cheb2jac(Id, α, β)),
-                          (plan_ultra2cheb(Id, λ), plan_cheb2ultra(Id, λ)))
-            y = p1*x
-            z = p2*y
-            @test z ≈ x
-            y = p1*x
-            z = p1'y
-            y = transpose(p1)*z
-            z = transpose(p1)\y
-            y = p1'\z
-            z = p1\y
-            @test z ≈ x
-            y = p2*x
-            z = p2'y
-            y = transpose(p2)*z
-            z = transpose(p2)\y
-            y = p2'\z
-            z = p2\y
-            @test z ≈ x
-            P = p1*Id
-            Q = p2*P
-            @test_skip Q ≈ Id
-            P = p1*Id
-            Q = p1'P
-            P = transpose(p1)*Q
-            Q = transpose(p1)\P
-            P = p1'\Q
-            Q = p1\P
-            @test_skip Q ≈ Id
-            P = p2*Id
-            Q = p2'P
-            P = transpose(p2)*Q
-            Q = transpose(p2)\P
-            P = p2'\Q
-            Q = p2\P
-            @test_skip Q ≈ Id
-        end
+        p = plan_associatedjac2jac(Id, 1, α, β, γ, δ)
+        V = p*I
+        @test V ≈ p*Id
+        y = p*x
+        @test V\y ≈ x
     end
 
-    for T in (Float64, Complex{Float64})
-        A = T <: Real ? sphones(T, n, 2n-1) : sphones(T, n, 2n-1) + im*sphones(T, n, 2n-1)
-        p = plan_sph2fourier(A)
-        ps = plan_sph_synthesis(A)
-        pa = plan_sph_analysis(A)
-        B = copy(A)
-        C = ps*(p*A)
-        A = p\(pa*C)
-        @test A ≈ B
+    @testset "Modified classical orthonormal polynomial transforms" begin
+        (n, α, β) = (16, 0, 0)
+        for T in (Float32, Float64)
+            P1 = plan_modifiedjac2jac(T, n, α, β, T[0.9428090415820636, -0.32659863237109055, -0.42163702135578396, 0.2138089935299396]) # u1(x) = (1-x)^2*(1+x)
+            P2 = plan_modifiedjac2jac(T, n, α, β, T[0.9428090415820636, -0.32659863237109055, -0.42163702135578396, 0.2138089935299396], T[1.4142135623730951]) # u2(x) = (1-x)^2*(1+x)
+            P3 = plan_modifiedjac2jac(T, n, α, β, T[-0.9428090415820636, 0.32659863237109055, 0.42163702135578396, -0.2138089935299396], T[-5.185449728701348, 0.0, 0.42163702135578374]) # u3(x) = -(1-x)^2*(1+x), v3(x) = -(2-x)*(2+x)
+            P4 = plan_modifiedjac2jac(T, n, α+2, β+1, T[1.1547005383792517], T[4.387862045841156, 0.1319657758147716, -0.20865621238292037]) # v4(x) = (2-x)*(2+x)
 
-        A = T <: Real ? sphvones(T, n, 2n-1) : sphvones(T, n, 2n-1) + im*sphvones(T, n, 2n-1)
-        p = plan_sphv2fourier(A)
-        ps = plan_sphv_synthesis(A)
-        pa = plan_sphv_analysis(A)
-        B = copy(A)
-        C = ps*(p*A)
-        A = p\(pa*C)
-        @test A ≈ B
+            @test P1*I ≈ P2*I
+            @test P1\I ≈ P2\I
+            @test P3*I ≈ P2*(P4*I)
+            @test P3\I ≈ P4\(P2\I)
 
-        A = T <: Real ? diskones(T, n, 4n-3) : diskones(T, n, 4n-3) + im*diskones(T, n, 4n-3)
-        p = plan_disk2cxf(A)
-        ps = plan_disk_synthesis(A)
-        pa = plan_disk_analysis(A)
-        B = copy(A)
-        C = ps*(p*A)
-        A = p\(pa*C)
-        @test A ≈ B
+            P5 = plan_modifiedlag2lag(T, n, α, T[2.0, -4.0, 2.0]) # u5(x) = x^2
+            P6 = plan_modifiedlag2lag(T, n, α, T[2.0, -4.0, 2.0], T[1.0]) # u6(x) = x^2
+            P7 = plan_modifiedlag2lag(T, n, α, T[2.0, -4.0, 2.0], T[7.0, -7.0, 2.0]) # u7(x) = x^2, v7(x) = (1+x)*(2+x)
+            P8 = plan_modifiedlag2lag(T, n, α+2, T[sqrt(2.0)], T[sqrt(1058.0), -sqrt(726.0), sqrt(48.0)]) # v8(x) = (1+x)*(2+x)
 
-        A = T <: Real ? triones(T, n, n) : triones(T, n, n) + im*triones(T, n, n)
-        p = plan_tri2cheb(A, α, β, γ)
-        ps = plan_tri_synthesis(A)
-        pa = plan_tri_analysis(A)
-        B = copy(A)
-        C = ps*(p*A)
-        A = p\(pa*C)
-        @test A ≈ B
+            @test P5*I ≈ P6*I
+            @test P5\I ≈ P6\I
+            @test isapprox(P7*I, P6*(P8*I); rtol = eps(T)^(1/4))
+            @test isapprox(P7\I, P8\(P6\I); rtol = eps(T)^(1/4))
+
+            P9 = plan_modifiedherm2herm(T, n, T[2.995504568550877, 0.0, 3.7655850551068593, 0.0, 1.6305461589167827], T[2.995504568550877, 0.0, 3.7655850551068593, 0.0, 1.6305461589167827]) # u9(x) = 1+x^2+x^4, v9(x) = 1+x^2+x^4
 
-        A = T <: Real ? tetones(T, n, n, n) : tetones(T, n, n, n) + im*tetones(T, n, n, n)
-        p = plan_tet2cheb(A, α, β, γ, δ)
-        ps = plan_tet_synthesis(A)
-        pa = plan_tet_analysis(A)
+            @test P9*I ≈ P9\I
+        end
+    end
+
+    function test_nd_plans(p, ps, pa, A)
         B = copy(A)
         C = ps*(p*A)
         A = p\(pa*C)
         @test A ≈ B
+        C = ps'*(p'A)
+        A = p'\(pa'C)
+        @test A ≈ B
+        C = transpose(ps)*(transpose(p)*A)
+        A = transpose(p)\(transpose(pa)*C)
+        @test A ≈ B
     end
+
+    A = sphones(Float64, n, 2n-1)
+    p = plan_sph2fourier(A)
+    ps = plan_sph_synthesis(A)
+    pa = plan_sph_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+    A = sphones(Float64, n, 2n-1) + im*sphones(Float64, n, 2n-1)
+    p = plan_sph2fourier(A)
+    ps = plan_sph_synthesis(A)
+    pa = plan_sph_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+
+    A = sphvones(Float64, n, 2n-1)
+    p = plan_sphv2fourier(A)
+    ps = plan_sphv_synthesis(A)
+    pa = plan_sphv_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+    A = sphvones(Float64, n, 2n-1) + im*sphvones(Float64, n, 2n-1)
+    p = plan_sphv2fourier(A)
+    ps = plan_sphv_synthesis(A)
+    pa = plan_sphv_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+
+    A = diskones(Float64, n, 4n-3)
+    p = plan_disk2cxf(A, α, β)
+    ps = plan_disk_synthesis(A)
+    pa = plan_disk_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+    A = diskones(Float64, n, 4n-3) + im*diskones(Float64, n, 4n-3)
+    p = plan_disk2cxf(A, α, β)
+    ps = plan_disk_synthesis(A)
+    pa = plan_disk_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+
+    A = diskones(Float64, n, 4n-3)
+    p = plan_ann2cxf(A, α, β, 0, ρ)
+    ps = plan_annulus_synthesis(A, ρ)
+    pa = plan_annulus_analysis(A, ρ)
+    test_nd_plans(p, ps, pa, A)
+    A = diskones(Float64, n, 4n-3) + im*diskones(Float64, n, 4n-3)
+    p = plan_ann2cxf(A, α, β, 0, ρ)
+    ps = plan_annulus_synthesis(A, ρ)
+    pa = plan_annulus_analysis(A, ρ)
+    test_nd_plans(p, ps, pa, A)
+
+    A = rectdiskones(Float64, n, n)
+    p = plan_rectdisk2cheb(A, β)
+    ps = plan_rectdisk_synthesis(A)
+    pa = plan_rectdisk_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+    A = rectdiskones(Float64, n, n) + im*rectdiskones(Float64, n, n)
+    p = plan_rectdisk2cheb(A, β)
+    ps = plan_rectdisk_synthesis(A)
+    pa = plan_rectdisk_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+
+    A = triones(Float64, n, n)
+    p = plan_tri2cheb(A, α, β, γ)
+    ps = plan_tri_synthesis(A)
+    pa = plan_tri_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+    A = triones(Float64, n, n) + im*triones(Float64, n, n)
+    p = plan_tri2cheb(A, α, β, γ)
+    ps = plan_tri_synthesis(A)
+    pa = plan_tri_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+
+    α, β, γ, δ = -0.1, -0.2, -0.3, -0.4
+    A = tetones(Float64, n, n, n)
+    p = plan_tet2cheb(A, α, β, γ, δ)
+    ps = plan_tet_synthesis(A)
+    pa = plan_tet_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+    A = tetones(Float64, n, n, n) + im*tetones(Float64, n, n, n)
+    p = plan_tet2cheb(A, α, β, γ, δ)
+    ps = plan_tet_synthesis(A)
+    pa = plan_tet_analysis(A)
+    test_nd_plans(p, ps, pa, A)
+
+    A = spinsphones(Complex{Float64}, n, 2n-1, 2) + im*spinsphones(Complex{Float64}, n, 2n-1, 2)
+    p = plan_spinsph2fourier(A, 2)
+    ps = plan_spinsph_synthesis(A, 2)
+    pa = plan_spinsph_analysis(A, 2)
+    test_nd_plans(p, ps, pa, A)
 end
+
+@testset "ultra2ultra bug and cheb2leg normalisation (#202, #203)" begin
+    @test ultra2ultra([0.0, 1.0], 1, 1) == [0,1] # equal source/target parameters: coefficients must come back unchanged
+    @test cheb2leg([0.0, 1.0], normcheb=true) ≈ [0.,sqrt(2/π)] # exercises the normcheb keyword path
+    @test cheb2leg([0.0, 1.0], normleg=true) ≈ [0.,sqrt(2/3)] # exercises the normleg keyword path
+end
\ No newline at end of file
diff --git a/test/nuffttests.jl b/test/nuffttests.jl
index 36fd6318..ee3c4689 100644
--- a/test/nuffttests.jl
+++ b/test/nuffttests.jl
@@ -1,4 +1,4 @@
-using FastTransforms, Test, FFTW
+using FFTW, FastTransforms, LinearAlgebra, Test
 
 FFTW.set_num_threads(ceil(Int, Sys.CPU_THREADS/2))
 
@@ -75,7 +75,7 @@ FFTW.set_num_threads(ceil(Int, Sys.CPU_THREADS/2))
         fftc = fft(c)
         if Sys.WORD_SIZE == 64
             @test_skip norm(nufft1(c, ω, ϵ) - fftc) == 0 # skip because fftw3 seems to change this
-            @test_skip norm(nufft2(c, x, ϵ) - fftc) == 0 # skip because fftw3 seems to change this
+            @test norm(nufft2(c, x, ϵ) - fftc) == 0
             @test_skip norm(nufft3(c, x, ω, ϵ) - fftc) == 0 # skip because fftw3 seems to change this
         end
         err_bnd = 500*eps(Float64)*norm(c)
diff --git a/test/paduatests.jl b/test/paduatests.jl
index cc46d462..c82dc579 100644
--- a/test/paduatests.jl
+++ b/test/paduatests.jl
@@ -53,4 +53,17 @@ using FastTransforms, Test
     g_l=paduaeval(g_xy,x,y,l,Val{false})
     @test f_xy(x,y) ≈ f_m
     @test g_xy(x,y) ≈ g_l
+
+    # odd n
+    m=135
+    l=85
+    f_m=paduaeval(f_xy,x,y,m,Val{true})
+    g_l=paduaeval(g_xy,x,y,l,Val{true})
+    @test f_xy(x,y) ≈ f_m
+    @test g_xy(x,y) ≈ g_l
+
+    f_m=paduaeval(f_xy,x,y,m,Val{false})
+    g_l=paduaeval(g_xy,x,y,l,Val{false})
+    @test f_xy(x,y) ≈ f_m
+    @test g_xy(x,y) ≈ g_l
 end
diff --git a/test/quadraturetests.jl b/test/quadraturetests.jl
index c79b346f..bceba48d 100644
--- a/test/quadraturetests.jl
+++ b/test/quadraturetests.jl
@@ -1,10 +1,10 @@
-using FastTransforms, Test
+using FastTransforms, LinearAlgebra, Test
 
 import FastTransforms: chebyshevmoments1, chebyshevmoments2,
                        chebyshevjacobimoments1, chebyshevjacobimoments2,
                        chebyshevlogmoments1, chebyshevlogmoments2
 
-@testset "Fejér and Clenshaw--Curtis quadrature" begin
+@testset "Fejér and Clenshaw–Curtis quadrature" begin
     N = 20
     f = x -> exp(x)
 
@@ -19,7 +19,7 @@ import FastTransforms: chebyshevmoments1, chebyshevmoments2,
 
     μ = chebyshevlogmoments1(Float64, N)
     w = clenshawcurtisweights(μ)
-    @test norm(sum(w./(x .- 3)) - π^2/12) ≤ 4eps()
+    @test norm(sum(w./(3 .- x)) - π^2/12) ≤ 4eps()
 
     x = fejernodes1(Float64, N)
     μ = chebyshevmoments1(Float64, N)
@@ -32,7 +32,7 @@ import FastTransforms: chebyshevmoments1, chebyshevmoments2,
 
     μ = chebyshevlogmoments1(Float64, N)
     w = fejerweights1(μ)
-    @test norm(sum(w./(x .- 3)) - π^2/12) ≤ 4eps()
+    @test norm(sum(w./(3 .- x)) - π^2/12) ≤ 4eps()
 
     x = fejernodes2(Float64, N)
     μ = chebyshevmoments2(Float64, N)
@@ -45,5 +45,5 @@ import FastTransforms: chebyshevmoments1, chebyshevmoments2,
 
     μ = chebyshevlogmoments2(Float64, N)
     w = fejerweights2(μ)
-    @test norm(sum(w./(x .- 3)) - π^2/12) ≤ 4eps()
+    @test norm(sum(w./(3 .- x)) - π^2/12) ≤ 4eps()
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 0d5c4592..36c95de8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,25 +1,15 @@
-using FastTransforms, Libdl, LinearAlgebra, Test
+using FastTransforms, LinearAlgebra, Test
 
 include("specialfunctionstests.jl")
-
 include("chebyshevtests.jl")
-
 include("quadraturetests.jl")
-
-if find_library(FastTransforms.libfasttransforms) ≡ FastTransforms.libfasttransforms
-    include("libfasttransformstests.jl")
-else
-    error("FastTransforms is not properly installed. Please run Pkg.build(\"FastTransforms\") ",
-          "and restart Julia.")
-end
-
+include("libfasttransformstests.jl")
 include("nuffttests.jl")
-
-include("fftBigFloattests.jl")
 include("paduatests.jl")
-
 include("gaunttests.jl")
-
 include("hermitetests.jl")
-
-include("toeplitztests.jl")
+include("toeplitzplanstests.jl")
+include("toeplitzhankeltests.jl")
+include("toeplitzplushankeltests.jl")
+include("grammatrixtests.jl")
+include("arraystests.jl")
diff --git a/test/specialfunctionstests.jl b/test/specialfunctionstests.jl
index 014958a3..85a2e1f9 100644
--- a/test/specialfunctionstests.jl
+++ b/test/specialfunctionstests.jl
@@ -1,6 +1,6 @@
-using FastTransforms, Test
+using FastTransforms, LinearAlgebra, Test
 
-import FastTransforms: pochhammer, sqrtpi, SpecialFunctions.gamma
+import FastTransforms: pochhammer, sqrtpi, gamma, lgamma
 import FastTransforms: Cnλ, Λ, lambertw, Cnαβ, Anαβ
 import FastTransforms: chebyshevmoments1, chebyshevmoments2, chebyshevjacobimoments1, chebyshevjacobimoments2, chebyshevlogmoments1, chebyshevlogmoments2
 
@@ -36,4 +36,10 @@ import FastTransforms: chebyshevmoments1, chebyshevmoments2, chebyshevjacobimome
 
     @test norm(Cnαβ.(n,α,β) ./ Cnαβ.(n,big(α),big(β)) .- 1, Inf) < 3eps()
     @test norm(Anαβ.(n,α,β) ./ Anαβ.(n,big(α),big(β)) .- 1, Inf) < 4eps()
+
+    @testset "BigFloat bug" begin
+        @test Λ(0.0, -1/2, 1.0) ≈ -exp(lgamma(-1/2)-lgamma(1.0))
+        @test Λ(1.0, -1/2, 1.0) ≈ exp(lgamma(1-1/2)-lgamma(2.0))
+        @test Float64(Λ(big(0.0), -1/2, 1.0)) ≈ Λ(0.0, -1/2, 1.0)
+    end
 end
diff --git a/test/toeplitzhankeltests.jl b/test/toeplitzhankeltests.jl
new file mode 100644
index 00000000..0b8731bf
--- /dev/null
+++ b/test/toeplitzhankeltests.jl
@@ -0,0 +1,186 @@
+using FastTransforms, LinearAlgebra, Test, Random
+import FastTransforms: th_leg2cheb, th_cheb2leg, th_leg2chebu, th_ultra2ultra,th_jac2jac, th_leg2chebu,
+                        lib_leg2cheb, lib_cheb2leg, lib_ultra2ultra, lib_jac2jac,
+                        plan_th_cheb2leg!, plan_th_leg2chebu!, plan_th_leg2cheb!, plan_th_ultra2ultra!, plan_th_jac2jac!,
+                        th_cheb2jac, th_jac2cheb
+
+Random.seed!(0)
+
+@testset "ToeplitzHankel" begin
+    for x in ([1.0], [1.0,2,3,4,5], [1.0+im,2-3im,3+4im,4-5im,5+10im], collect(1.0:1000))  # length 1, short real, short complex, length 1000
+        @test th_leg2cheb(x) ≈ lib_leg2cheb(x)  # each th_* result is compared with the lib_* function imported above
+        @test th_cheb2leg(x) ≈ lib_cheb2leg(x)
+        @test th_leg2chebu(x) ≈ lib_ultra2ultra(x, 0.5, 1.0)  # th_leg2chebu is checked against ultra2ultra with parameters 0.5 and 1.0
+        @test th_ultra2ultra(x,0.1, 0.2) ≈ lib_ultra2ultra(x, 0.1, 0.2)
+        @test th_ultra2ultra(x,1, 2) ≈ lib_ultra2ultra(x, 1, 2)  # integer parameters
+        @test th_ultra2ultra(x,0.1, 2.2) ≈ lib_ultra2ultra(x, 0.1, 2.2)
+        @test th_ultra2ultra(x, 2.2, 0.1) ≈ lib_ultra2ultra(x, 2.2, 0.1)  # decreasing parameter
+        @test th_ultra2ultra(x, 1, 3) ≈ lib_ultra2ultra(x, 1, 3)
+        @test @inferred(th_jac2jac(x,0.1, 0.2,0.1,0.4)) ≈ lib_jac2jac(x, 0.1, 0.2,0.1,0.4)  # @inferred also checks the return type is inferrable
+        @test th_jac2jac(x,0.1, 0.2,0.3,0.2) ≈ lib_jac2jac(x, 0.1, 0.2,0.3,0.2)
+        @test th_jac2jac(x,0.1, 0.2,0.3,0.4) ≈ lib_jac2jac(x, 0.1, 0.2,0.3,0.4)
+        @test @inferred(th_jac2jac(x,0.1, 0.2,1.3,0.4)) ≈ lib_jac2jac(x, 0.1, 0.2,1.3,0.4)
+        @test th_jac2jac(x,0.1, 0.2,1.3,2.4) ≈ lib_jac2jac(x, 0.1, 0.2,1.3,2.4)
+        @test th_jac2jac(x,1.3,2.4, 0.1, 0.2) ≈ lib_jac2jac(x,1.3,2.4, 0.1, 0.2)
+        @test th_jac2jac(x,1.3, 1.2,-0.1,-0.2) ≈ lib_jac2jac(x, 1.3, 1.2,-0.1,-0.2)  # negative target parameters
+        @test @inferred(th_jac2jac(x,-0.5, -0.5, -0.5,-0.5)) ≈ lib_jac2jac(x, -0.5, -0.5, -0.5,-0.5)  # source and target parameters identical
+        @test th_jac2jac(x,-0.5, -0.5, 0.5,0.5) ≈ lib_jac2jac(x, -0.5, -0.5, 0.5,0.5)
+        @test th_jac2jac(x,0.5,0.5,-0.5, -0.5) ≈ lib_jac2jac(x, 0.5,0.5,-0.5, -0.5)
+        @test th_jac2jac(x,-0.5, -0.5, 0.5,-0.5) ≈ lib_jac2jac(x, -0.5, -0.5, 0.5,-0.5)
+        @test th_jac2jac(x, -1/2,-1/2,1/2,0) ≈ lib_jac2jac(x, -1/2,-1/2,1/2,0)  # mixed Float64 and Int parameters
+        @test th_jac2jac(x, -1/2,-1/2,0,1/2) ≈ lib_jac2jac(x, -1/2,-1/2,0,1/2)
+        @test th_jac2jac(x, -3/4,-3/4,0,3/4) ≈ lib_jac2jac(x, -3/4,-3/4,0,3/4)
+        if length(x) < 10  # skipped for the 1000-element input; NOTE(review): reason not stated, presumably accuracy — confirm
+            @test th_jac2jac(x,0, 0, 5, 5) ≈ lib_jac2jac(x, 0, 0, 5, 5)
+            @test th_jac2jac(x, 5, 5, 0, 0) ≈ lib_jac2jac(x,  5, 5, 0, 0)
+        end
+
+        @test th_cheb2jac(x, 0.2, 0.3) ≈ cheb2jac(x, 0.2, 0.3)  # reference here is the unqualified cheb2jac / jac2cheb
+        @test th_jac2cheb(x, 0.2, 0.3) ≈ jac2cheb(x, 0.2, 0.3)
+        @test th_cheb2jac(x, 1, 1) ≈ cheb2jac(x, 1, 1)
+        @test th_jac2cheb(x, 1, 1) ≈ jac2cheb(x, 1, 1)
+
+        @test th_cheb2leg(th_leg2cheb(x)) ≈ x  # round trips: applying a transform and its reverse recovers x
+        @test th_leg2cheb(th_cheb2leg(x)) ≈ x
+        @test th_ultra2ultra(th_ultra2ultra(x, 0.1, 0.6), 0.6, 0.1) ≈ x
+        @test th_jac2jac(th_jac2jac(x, 0.1, 0.6, 0.1, 0.8), 0.1, 0.8, 0.1, 0.6) ≈ x
+        @test th_jac2jac(th_jac2jac(x, 0.1, 0.6, 0.2, 0.8), 0.2, 0.8, 0.1, 0.6) ≈ x
+    end
+
+    for X in (randn(5,4), randn(5,4) + im*randn(5,4))  # one real and one complex 5×4 matrix
+        @test th_leg2cheb(X, 1) ≈ hcat([leg2cheb(X[:,j]) for j=1:size(X,2)]...)  # dims = 1: column-by-column reference
+        @test_broken th_leg2cheb(X, 1) ≈ leg2cheb(X, 1) # matrices not supported in FastTransforms
+        @test th_leg2cheb(X, 2) ≈ vcat([permutedims(leg2cheb(X[k,:])) for k=1:size(X,1)]...)  # dims = 2: row-by-row reference
+        @test_broken th_leg2cheb(X, 2) ≈ leg2cheb(X, 2)
+        @test th_leg2cheb(X) ≈ th_leg2cheb(th_leg2cheb(X, 1), 2)  # no dims: same as dims 1 followed by dims 2
+        @test_broken th_leg2cheb(X) ≈ leg2cheb(X)
+
+        @test th_cheb2leg(X, 1) ≈ hcat([cheb2leg(X[:,j]) for j=1:size(X,2)]...)
+        @test th_cheb2leg(X, 2) ≈ vcat([permutedims(cheb2leg(X[k,:])) for k=1:size(X,1)]...)
+        @test th_cheb2leg(X) ≈ th_cheb2leg(th_cheb2leg(X, 1), 2)
+
+        @test th_cheb2leg(X) == plan_th_cheb2leg!(X, 1:2)*copy(X)  # the plan is applied to a copy so X stays available as input
+        @test th_leg2cheb(X) == plan_th_leg2cheb!(X, 1:2)*copy(X)
+
+        @test th_leg2cheb(th_cheb2leg(X)) ≈ X
+
+        @test th_leg2chebu(X, 1) ≈ hcat([ultra2ultra(X[:,j], 0.5, 1.0) for j=1:size(X,2)]...)
+        @test th_leg2chebu(X, 2) ≈ vcat([permutedims(ultra2ultra(X[k,:], 0.5, 1.0)) for k=1:size(X,1)]...)
+        @test th_leg2chebu(X) ≈ th_leg2chebu(th_leg2chebu(X, 1), 2)
+
+        @test th_leg2chebu(X) == plan_th_leg2chebu!(X, 1:2)*copy(X)
+
+        @test th_ultra2ultra(X, 0.1, 0.6, 1) ≈ hcat([ultra2ultra(X[:,j], 0.1, 0.6) for j=1:size(X,2)]...)
+        @test th_ultra2ultra(X, 0.1, 0.6, 2) ≈ vcat([permutedims(ultra2ultra(X[k,:], 0.1, 0.6)) for k=1:size(X,1)]...)
+        @test th_ultra2ultra(X, 0.1, 0.6) ≈ th_ultra2ultra(th_ultra2ultra(X, 0.1, 0.6, 1), 0.1, 0.6, 2)
+
+        @test th_ultra2ultra(X, 0.1, 2.6, 1) ≈ hcat([ultra2ultra(X[:,j], 0.1, 2.6) for j=1:size(X,2)]...)
+        @test th_ultra2ultra(X, 0.1, 2.6, 2) ≈ vcat([permutedims(ultra2ultra(X[k,:], 0.1, 2.6)) for k=1:size(X,1)]...)
+        @test th_ultra2ultra(X, 0.1, 2.6) ≈ th_ultra2ultra(th_ultra2ultra(X, 0.1, 2.6, 1), 0.1, 2.6, 2)
+
+        @test th_ultra2ultra(X, 2.6, 0.1, 1) ≈ hcat([ultra2ultra(X[:,j], 2.6, 0.1) for j=1:size(X,2)]...)
+        @test th_ultra2ultra(X, 2.6, 0.1, 2) ≈ vcat([permutedims(ultra2ultra(X[k,:], 2.6, 0.1)) for k=1:size(X,1)]...)
+        @test th_ultra2ultra(X, 2.6, 0.1) ≈ th_ultra2ultra(th_ultra2ultra(X, 2.6, 0.1, 1), 2.6, 0.1, 2)
+
+        @test th_ultra2ultra(X, 0.1, 0.6) == plan_th_ultra2ultra!(X, 0.1, 0.6, 1:2)*copy(X)
+        @test th_ultra2ultra(X, 0.6, 0.1) == plan_th_ultra2ultra!(X, 0.6, 0.1, 1:2)*copy(X)  # reverse direction (this line used to repeat the one above)
+
+        @test th_ultra2ultra(th_ultra2ultra(X, 0.1, 0.6), 0.6, 0.1) ≈ X
+
+        @test th_jac2jac(X, 0.1, 0.6, 0.1, 0.8, 1) ≈ hcat([jac2jac(X[:,j], 0.1, 0.6, 0.1, 0.8) for j=1:size(X,2)]...)
+        @test th_jac2jac(X, 0.1, 0.6, 0.1, 0.8, 2) ≈ vcat([permutedims(jac2jac(X[k,:], 0.1, 0.6, 0.1, 0.8)) for k=1:size(X,1)]...)
+        @test th_jac2jac(X, 0.1, 0.6, 0.1, 0.8) ≈ th_jac2jac(th_jac2jac(X, 0.1, 0.6, 0.1, 0.8, 1), 0.1, 0.6, 0.1, 0.8, 2)
+
+        @test th_jac2jac(X, 0.1, 0.6, 0.2, 0.8, 1) ≈ hcat([jac2jac(X[:,j], 0.1, 0.6, 0.2, 0.8) for j=1:size(X,2)]...)
+        @test th_jac2jac(X, 0.1, 0.6, 0.2, 0.8, 2) ≈ vcat([permutedims(jac2jac(X[k,:], 0.1, 0.6, 0.2, 0.8)) for k=1:size(X,1)]...)
+
+        @test th_jac2jac(X, 0.1, 0.6, 0.1, 0.8) == plan_th_jac2jac!(X, 0.1, 0.6, 0.1, 0.8, 1:2)*copy(X)
+        @test th_jac2jac(X, 0.1, 0.8, 0.1, 0.6) == plan_th_jac2jac!(X, 0.1, 0.8, 0.1, 0.6, 1:2)*copy(X)  # reverse direction (this line used to repeat the one above)
+
+        @test th_jac2jac(th_jac2jac(X, 0.1, 0.6, 0.1, 0.8), 0.1, 0.8, 0.1, 0.6) ≈ X
+
+        @test th_jac2jac(X, 0.1, 0.6, 3.1, 2.8, 1) ≈ hcat([jac2jac(X[:,j], 0.1, 0.6, 3.1, 2.8) for j=1:size(X,2)]...)
+        @test th_jac2jac(X, 0.1, 0.6, 3.1, 2.8, 2) ≈ vcat([permutedims(jac2jac(X[k,:], 0.1, 0.6, 3.1, 2.8)) for k=1:size(X,1)]...)
+        @test th_jac2jac(X, 0.1, 0.6, 3.1, 2.8) ≈ th_jac2jac(th_jac2jac(X, 0.1, 0.6, 3.1, 2.8, 1), 0.1, 0.6, 3.1, 2.8, 2)
+
+        @test th_jac2jac(X, -0.5, -0.5, 3.1, 2.8, 1) ≈ hcat([jac2jac(X[:,j], -0.5, -0.5, 3.1, 2.8) for j=1:size(X,2)]...)
+        @test th_jac2jac(X, -0.5, -0.5, 3.1, 2.8, 2) ≈ vcat([permutedims(jac2jac(X[k,:], -0.5, -0.5, 3.1, 2.8)) for k=1:size(X,1)]...)
+        @test th_jac2jac(X, -0.5, -0.5, 3.1, 2.8) ≈ th_jac2jac(th_jac2jac(X, -0.5, -0.5, 3.1, 2.8, 1), -0.5, -0.5, 3.1, 2.8, 2)
+
+        @test th_cheb2jac(X, 3.1, 2.8, 1) ≈ hcat([cheb2jac(X[:,j], 3.1, 2.8) for j=1:size(X,2)]...)
+        @test th_cheb2jac(X, 3.1, 2.8, 2) ≈ vcat([permutedims(cheb2jac(X[k,:], 3.1, 2.8)) for k=1:size(X,1)]...)
+        @test th_cheb2jac(X, 3.1, 2.8) ≈ th_cheb2jac(th_cheb2jac(X, 3.1, 2.8, 1), 3.1, 2.8, 2)
+
+        @test th_jac2cheb(X, 3.1, 2.8, 1) ≈ hcat([jac2cheb(X[:,j], 3.1, 2.8) for j=1:size(X,2)]...)
+        @test th_jac2cheb(X, 3.1, 2.8, 2) ≈ vcat([permutedims(jac2cheb(X[k,:], 3.1, 2.8)) for k=1:size(X,1)]...)
+        @test th_jac2cheb(X, 3.1, 2.8) ≈ th_jac2cheb(th_jac2cheb(X, 3.1, 2.8, 1), 3.1, 2.8, 2)
+    end
+
+    @testset "BigFloat" begin
+        len = 10
+        coeffs = BigFloat.(collect(1.0:len))  # the values 1.0 through 10.0 as a Vector{BigFloat}
+        @test th_leg2cheb(coeffs) ≈ lib_leg2cheb(coeffs)
+        @test th_cheb2leg(coeffs) ≈ lib_cheb2leg(coeffs)
+    end
+
+    @testset "jishnub example" begin
+        # cospi(1000x) sampled at 4096 Chebyshev points, converted to th_cheb2leg coefficients
+        y = cospi.(1000 .* chebyshevpoints(4096))
+        v = th_cheb2leg(chebyshevtransform(y))
+        err = v - th_cheb2leg(th_leg2cheb(v))  # round-trip error, computed once for both checks
+        @test norm(err, Inf) ≤ 1E-13
+        @test norm(err)/norm(v) ≤ 1E-14
+    end
+
+    @testset "tensor" begin
+        A = randn(5,4,3)
+        for transform in (th_leg2cheb, th_cheb2leg)
+            for d in 1:2  # dims 1 and 2 act on each slab A[:,:,s] independently
+                B = transform(A, d)
+                for s in axes(A, 3)
+                    @test B[:,:,s] ≈ transform(A[:,:,s], d)
+                end
+            end
+            # dimension 3 of A is dimension 2 of the slice A[:,c,:]
+            B = transform(A, 3)
+            for c in axes(A, 2)
+                @test B[:,c,:] ≈ transform(A[:,c,:], 2)
+            end
+
+            B = transform(A, (1,3))
+            for c in axes(A, 2)
+                @test B[:,c,:] ≈ transform(A[:,c,:])
+            end
+
+            # all three dimensions: matrix transform of every slab, then along dimension 3
+            B = transform(A, 1:3)
+            expected = copy(A)
+            for s in axes(A, 3)
+                expected[:,:,s] = transform(expected[:,:,s])
+            end
+            for r in axes(A, 1), c in axes(A, 2)
+                expected[r,c,:] = transform(expected[r,c,:])
+            end
+            @test expected ≈ B
+        end
+    end
+
+    @testset "inv" begin  # pl \ (pl * A) should recover A for vector, matrix and 3-tensor plans
+        x = randn(10)
+        pl = plan_th_cheb2leg!(x)
+        @test size(pl) == (10,)
+        @test pl\(pl*copy(x)) ≈ x  # apply to a copy, as below: an in-place plan would otherwise make this x ≈ x
+
+        X = randn(10,3)
+        for pl in (plan_th_cheb2leg!(X), plan_th_cheb2leg!(X, 1), plan_th_cheb2leg!(X, 2))  # default dims, dims 1, dims 2
+            @test size(pl) == (10,3)
+            @test pl\(pl*copy(X)) ≈ X
+        end
+
+        X = randn(10,3,5)
+        for pl in (plan_th_cheb2leg!(X), plan_th_cheb2leg!(X, 1), plan_th_cheb2leg!(X, 2),  plan_th_cheb2leg!(X, 3))
+            @test size(pl) == (10,3,5)
+            @test pl\(pl*copy(X)) ≈ X
+        end
+    end
+end
\ No newline at end of file
diff --git a/test/toeplitzplanstests.jl b/test/toeplitzplanstests.jl
new file mode 100644
index 00000000..6ea6a095
--- /dev/null
+++ b/test/toeplitzplanstests.jl
@@ -0,0 +1,126 @@
+using FastTransforms, Test
+import FastTransforms: plan_uppertoeplitz!
+
+@testset "ToeplitzPlan" begin
+    @testset "Vector" begin
+        dense = [1 2 3; 0 1 2; 0 0 1]  # upper-triangular Toeplitz matrix with first row 1, 2, 3
+        plan = plan_uppertoeplitz!([1,2,3])
+        v = randn(3)
+        @test dense * v ≈ plan * copy(v)
+    end
+
+    @testset "Matrix" begin
+        T3 = [1 2 3; 0 1 2; 0 0 1]
+
+        # square input: the same 3×3 matrix is the reference along either dimension
+        A = randn(3,3)
+        @test plan_uppertoeplitz!([1,2,3], size(A), 1) * copy(A) ≈ T3 * A
+        @test plan_uppertoeplitz!([1,2,3], size(A), 2) * copy(A) ≈ A * T3'
+
+        # without a dims argument both dimensions are transformed
+        @test plan_uppertoeplitz!([1,2,3], size(A)) * copy(A) ≈ T3 * A * T3'
+
+        # rectangular input: dimension 2 has length 4, so its reference is 4×4
+        B = randn(3,4)
+        T4 = [1 2 3 4; 0 1 2 3; 0 0 1 2; 0 0 0 1]
+        plan_dim1 = plan_uppertoeplitz!([1,2,3], size(B), 1)
+        @test plan_dim1 * copy(B) ≈ T3 * B
+        plan_dim2 = plan_uppertoeplitz!([1,2,3,4], size(B), 2)
+        @test plan_dim2 * copy(B) ≈ B * T4'
+        plan_both = plan_uppertoeplitz!([1,2,3,4], size(B))
+        @test plan_both * copy(B) ≈ T3 * B * T4'
+    end
+
+    @testset "Tensor" begin
+        T = [1 2 3; 0 1 2; 0 0 1]
+        
+        @testset "3D" begin  # plans on a 3×3×3 array, checked slice by slice against the dense matrix T
+            X = randn(3,3,3)
+            P = plan_uppertoeplitz!([1,2,3], size(X), 1)  # dims = 1: left-multiplication of each slab X[:,:,ℓ]
+            PX = P * copy(X)
+            for ℓ = 1:size(X,3)
+                @test PX[:,:,ℓ] ≈ T*X[:,:,ℓ]
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), 2)  # dims = 2: right-multiplication of each slab by T'
+            PX = P * copy(X)
+            for ℓ = 1:size(X,3)
+                @test PX[:,:,ℓ] ≈ X[:,:,ℓ]*T'
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), 3)  # dims = 3 is the second dimension of the slice X[:,j,:]
+            PX = P * copy(X)
+            for j = 1:size(X,2)
+                @test PX[:,j,:] ≈ X[:,j,:]*T'
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), (1,3))  # tuple dims: both dimensions of the slice X[:,j,:]
+            PX = P * copy(X)
+            for j = 1:size(X,2)
+                @test PX[:,j,:] ≈ T*X[:,j,:]*T'
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), 1:3)  # all three dimensions
+            PX = P * copy(X)
+            M = copy(X)  # reference, built in two in-place passes
+            for j = 1:size(X,3)
+                M[:,:,j] = T*M[:,:,j]*T'  # pass 1: dimensions 1 and 2 of every slab
+            end
+            for k = 1:size(X,1)
+                M[k,:,:] = M[k,:,:]*T'  # pass 2: dimension 3, applied to the result of pass 1
+            end
+            @test M ≈ PX
+        end
+
+        @testset "4D" begin  # plans on a 3×3×3×3 array, checked on 2-D slices against the dense matrix T
+            X = randn(3,3,3,3)
+            P = plan_uppertoeplitz!([1,2,3], size(X), 1)  # dims = 1: left-multiplication of each slice X[:,:,ℓ,m]
+            PX = P * copy(X)
+            for ℓ = 1:size(X,3), m = 1:size(X,4)
+                @test PX[:,:,ℓ,m] ≈ T*X[:,:,ℓ,m]
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), 2)  # dims = 2: right-multiplication by T'
+            PX = P * copy(X)
+            for ℓ = 1:size(X,3), m = 1:size(X,4)
+                @test PX[:,:,ℓ,m] ≈ X[:,:,ℓ,m]*T'
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), 3)  # dims = 3 is the second dimension of the slice X[:,j,:,m]
+            PX = P * copy(X)
+            for j = 1:size(X,2), m = 1:size(X,4)
+                @test PX[:,j,:,m] ≈ X[:,j,:,m]*T'
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), 4)  # dims = 4 is the second dimension of the slice X[k,j,:,:]
+            PX = P * copy(X)
+            for k = 1:size(X,1), j = 1:size(X,2)
+                @test PX[k,j,:,:] ≈ X[k,j,:,:]*T'
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), (1,3))  # tuple dims: both dimensions of the slice X[:,j,:,m]
+            PX = P * copy(X)
+            for j = 1:size(X,2), m=1:size(X,4)
+                @test PX[:,j,:,m] ≈ T*X[:,j,:,m]*T'
+            end
+
+            P = plan_uppertoeplitz!([1,2,3], size(X), 1:4)  # all four dimensions
+            PX = P * copy(X)
+            M = copy(X)  # reference, built in two in-place passes
+            for ℓ = 1:size(X,3), m = 1:size(X,4)
+                M[:,:,ℓ,m] = T*M[:,:,ℓ,m]*T'  # pass 1: dimensions 1 and 2
+            end
+            for k = 1:size(X,1), j = 1:size(X,2)
+                M[k,j,:,:] = T*M[k,j,:,:]*T'  # pass 2: dimensions 3 and 4, applied to the result of pass 1
+            end
+            @test M ≈ PX
+        end
+    end
+
+    @testset "BigFloat" begin
+        p = big(π)  # BigFloat entry shared by the plan coefficients and the dense matrix
+        dense = [p 2 3; 0 p 2; 0 0 p]
+        v = randn(3)
+        @test plan_uppertoeplitz!([p,2,3]) * copy(v) ≈ dense * v
+    end
+end
\ No newline at end of file
diff --git a/test/toeplitzplushankeltests.jl b/test/toeplitzplushankeltests.jl
new file mode 100644
index 00000000..4e0c5633
--- /dev/null
+++ b/test/toeplitzplushankeltests.jl
@@ -0,0 +1,15 @@
+using FastTransforms, LinearAlgebra, Test
+
+import FastTransforms: normest
+
+@testset "ToeplitzPlusHankel" begin
+    n = 128
+    for F in (Float32, Float64)
+        # Gram matrix built from 2n-1 moments, then converted to ToeplitzPlusHankel
+        gram = ChebyshevGramMatrix(FastTransforms.chebyshevmoments1(F, 2n-1))
+        tph = ToeplitzPlusHankel(gram)
+        @test tph ≈ gram
+        @test norm(tph) ≤ normest(tph)
+        @test normest(tph) == normest(gram)
+    end
+end
diff --git a/test/toeplitztests.jl b/test/toeplitztests.jl
deleted file mode 100644
index f11bd6d1..00000000
--- a/test/toeplitztests.jl
+++ /dev/null
@@ -1,26 +0,0 @@
-using FastTransforms, Test, ToeplitzMatrices
-
-@testset "BigFloat Toeplitz" begin
-    T = Toeplitz(BigFloat[1,2,3,4,5], BigFloat[1,6,7,8,0])
-    @test T*ones(BigFloat,5) ≈ [22,24,19,16,15]
-
-    let n = 512
-        r = map(BigFloat,rand(n))
-        T = Toeplitz(r,[r[1];map(BigFloat,rand(n-1))])
-        @test T*ones(BigFloat,n) ≈ Matrix(T)*ones(BigFloat,n)
-
-        T = TriangularToeplitz(BigFloat[1,2,3,4,5],:L)
-        @test T*ones(BigFloat,5) ≈ Matrix(T)*ones(BigFloat,5)
-
-        r = map(BigFloat,rand(n))
-        T = TriangularToeplitz(r,:L)
-        @test T*ones(BigFloat,n) ≈ Matrix(T)*ones(BigFloat,n)
-
-        T = TriangularToeplitz(BigFloat[1,2,3,4,5],:U)
-        @test T*ones(BigFloat,5) ≈ Matrix(T)*ones(BigFloat,5)
-
-        r = map(BigFloat,rand(n))
-        T = TriangularToeplitz(r,:U)
-        @test T*ones(BigFloat,n) ≈ Matrix(T)*ones(BigFloat,n)
-    end
-end